From 5d48dccb9b5ebca755fdb3025924ae0e15bf18ca Mon Sep 17 00:00:00 2001 From: dilip kumar Date: Sat, 25 Jun 2022 15:12:27 +0530 Subject: [PATCH v4 3/4] Use 56 bits for relfilenumber to avoid wraparound MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of this patch, we will make the relfilenumber 56 bits wide. But the problem is that if we make it 56 bits wide then the size of the BufferTag will be increased which will increase the memory usage and that may also impact the performance. So in order to avoid that inside the buffer tag, instead of using 64 bits for the relfilenumber we will use 8 bits for the fork number and 56 bits for the relfilenumber. --- contrib/pg_buffercache/Makefile | 3 +- .../pg_buffercache/pg_buffercache--1.3--1.4.sql | 30 +++++++ contrib/pg_buffercache/pg_buffercache.control | 2 +- contrib/pg_buffercache/pg_buffercache_pages.c | 31 ++++++- contrib/pg_walinspect/expected/pg_walinspect.out | 4 +- contrib/pg_walinspect/sql/pg_walinspect.sql | 4 +- doc/src/sgml/catalogs.sgml | 2 +- doc/src/sgml/pgbuffercache.sgml | 2 +- src/backend/access/gin/ginxlog.c | 2 +- src/backend/access/rmgrdesc/gistdesc.c | 2 +- src/backend/access/rmgrdesc/heapdesc.c | 2 +- src/backend/access/rmgrdesc/nbtdesc.c | 2 +- src/backend/access/rmgrdesc/seqdesc.c | 2 +- src/backend/access/rmgrdesc/xlogdesc.c | 21 +++-- src/backend/access/transam/README | 4 +- src/backend/access/transam/varsup.c | 94 +++++++++++++++++++++- src/backend/access/transam/xlog.c | 48 +++++++++++ src/backend/access/transam/xlogprefetcher.c | 18 ++--- src/backend/access/transam/xlogrecovery.c | 6 +- src/backend/access/transam/xlogutils.c | 8 +- src/backend/catalog/catalog.c | 93 --------------------- src/backend/catalog/heap.c | 8 +- src/backend/catalog/index.c | 4 +- src/backend/commands/tablecmds.c | 10 ++- src/backend/nodes/outfuncs.c | 2 +- src/backend/replication/logical/decode.c | 1 + src/backend/replication/logical/reorderbuffer.c | 2 +- 
src/backend/storage/freespace/fsmpage.c | 2 +- src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/storage/smgr/smgr.c | 2 +- src/backend/utils/adt/dbsize.c | 4 +- src/backend/utils/adt/pg_upgrade_support.c | 9 ++- src/backend/utils/cache/relcache.c | 5 +- src/backend/utils/cache/relfilenumbermap.c | 4 +- src/backend/utils/misc/pg_controldata.c | 9 ++- src/bin/pg_checksums/pg_checksums.c | 6 +- src/bin/pg_controldata/pg_controldata.c | 2 + src/bin/pg_dump/pg_dump.c | 20 ++--- src/bin/pg_rewind/filemap.c | 6 +- src/bin/pg_upgrade/info.c | 11 +-- src/bin/pg_upgrade/pg_upgrade.c | 6 +- src/bin/pg_upgrade/relfilenumber.c | 4 +- src/bin/pg_waldump/pg_waldump.c | 2 +- src/common/relpath.c | 20 ++--- src/fe_utils/option_utils.c | 42 ++++++++++ src/include/access/transam.h | 5 ++ src/include/access/xlog.h | 1 + src/include/catalog/catalog.h | 3 - src/include/catalog/pg_class.h | 10 +-- src/include/catalog/pg_control.h | 2 + src/include/catalog/pg_proc.dat | 10 +-- src/include/fe_utils/option_utils.h | 3 + src/include/postgres_ext.h | 7 +- src/include/storage/buf_internals.h | 18 +++-- src/include/storage/relfilelocator.h | 12 ++- src/test/regress/expected/alter_table.out | 24 +++--- src/test/regress/sql/alter_table.sql | 8 +- 57 files changed, 423 insertions(+), 242 deletions(-) create mode 100644 contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile index 2ab8c65..2fbb62f 100644 --- a/contrib/pg_buffercache/Makefile +++ b/contrib/pg_buffercache/Makefile @@ -7,7 +7,8 @@ OBJS = \ EXTENSION = pg_buffercache DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \ - pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql + pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \ + pg_buffercache--1.3--1.4.sql PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time" ifdef USE_PGXS diff --git a/contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql 
b/contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql new file mode 100644 index 0000000..ee2d9c7 --- /dev/null +++ b/contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql @@ -0,0 +1,30 @@ +/* contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.4'" to load this file. \quit + +/* First we have to remove them from the extension */ +ALTER EXTENSION pg_buffercache DROP VIEW pg_buffercache; +ALTER EXTENSION pg_buffercache DROP FUNCTION pg_buffercache_pages(); + +/* Then we can drop them */ +DROP VIEW pg_buffercache; +DROP FUNCTION pg_buffercache_pages(); + +/* Now redefine */ +CREATE OR REPLACE FUNCTION pg_buffercache_pages() +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'pg_buffercache_pages_v1_4' +LANGUAGE C PARALLEL SAFE; + +CREATE OR REPLACE VIEW pg_buffercache AS + SELECT P.* FROM pg_buffercache_pages() AS P + (bufferid integer, relfilenode int8, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, + pinning_backends int4); + +-- Don't want these to be available to public. 
+REVOKE ALL ON FUNCTION pg_buffercache_pages() FROM PUBLIC; +REVOKE ALL ON pg_buffercache FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_buffercache_pages() TO pg_monitor; +GRANT SELECT ON pg_buffercache TO pg_monitor; diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control index 8c060ae..a82ae5f 100644 --- a/contrib/pg_buffercache/pg_buffercache.control +++ b/contrib/pg_buffercache/pg_buffercache.control @@ -1,5 +1,5 @@ # pg_buffercache extension comment = 'examine the shared buffer cache' -default_version = '1.3' +default_version = '1.4' module_pathname = '$libdir/pg_buffercache' relocatable = true diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index abc8813..4e3884b 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -59,9 +59,10 @@ typedef struct * relation node/tablespace/database/blocknum and dirty indicator. */ PG_FUNCTION_INFO_V1(pg_buffercache_pages); +PG_FUNCTION_INFO_V1(pg_buffercache_pages_v1_4); -Datum -pg_buffercache_pages(PG_FUNCTION_ARGS) +static Datum +pg_buffercache_pages_internal(PG_FUNCTION_ARGS, Oid rfn_typid) { FuncCallContext *funcctx; Datum result; @@ -103,7 +104,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", INT4OID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", - OIDOID, -1, 0); + rfn_typid, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", OIDOID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", @@ -209,7 +210,16 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) } else { - values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber); + if (rfn_typid == INT8OID) + values[1] = + Int64GetDatum((int64) fctx->record[i].relfilenumber); + else + { + Assert(rfn_typid == OIDOID); + values[1] = + ObjectIdGetDatum((Oid) fctx->record[i].relfilenumber); + } + nulls[1] = false; 
values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace); nulls[2] = false; @@ -237,3 +247,16 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) else SRF_RETURN_DONE(funcctx); } + +Datum +pg_buffercache_pages(PG_FUNCTION_ARGS) +{ + return pg_buffercache_pages_internal(fcinfo, OIDOID); +} + +/* entry point for extension version 1.4 (relfilenode returned as int8) */ +Datum +pg_buffercache_pages_v1_4(PG_FUNCTION_ARGS) +{ + return pg_buffercache_pages_internal(fcinfo, INT8OID); +} diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out index a1ee743..e9b06ed 100644 --- a/contrib/pg_walinspect/expected/pg_walinspect.out +++ b/contrib/pg_walinspect/expected/pg_walinspect.out @@ -54,9 +54,9 @@ SELECT COUNT(*) >= 0 AS ok FROM pg_get_wal_stats_till_end_of_wal(:'wal_lsn1'); -- =================================================================== -- Test for filtering out WAL records of a particular table -- =================================================================== -SELECT oid AS sample_tbl_oid FROM pg_class WHERE relname = 'sample_tbl' \gset +SELECT relfilenode AS sample_tbl_relfilenode FROM pg_class WHERE relname = 'sample_tbl' \gset SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2') - WHERE block_ref LIKE concat('%', :'sample_tbl_oid', '%') AND resource_manager = 'Heap'; + WHERE block_ref LIKE concat('%', :'sample_tbl_relfilenode', '%') AND resource_manager = 'Heap'; ok ---- t diff --git a/contrib/pg_walinspect/sql/pg_walinspect.sql b/contrib/pg_walinspect/sql/pg_walinspect.sql index 1b265ea..5393834 100644 --- a/contrib/pg_walinspect/sql/pg_walinspect.sql +++ b/contrib/pg_walinspect/sql/pg_walinspect.sql @@ -39,10 +39,10 @@ SELECT COUNT(*) >= 0 AS ok FROM pg_get_wal_stats_till_end_of_wal(:'wal_lsn1'); -- Test for filtering out WAL records of a particular table -- =================================================================== -SELECT oid AS sample_tbl_oid FROM pg_class WHERE relname = 'sample_tbl' \gset 
+SELECT relfilenode AS sample_tbl_relfilenode FROM pg_class WHERE relname = 'sample_tbl' \gset SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2') - WHERE block_ref LIKE concat('%', :'sample_tbl_oid', '%') AND resource_manager = 'Heap'; + WHERE block_ref LIKE concat('%', :'sample_tbl_relfilenode', '%') AND resource_manager = 'Heap'; -- =================================================================== -- Test for filtering out WAL records based on resource_manager and diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 25b02c4..076bf8f 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -1965,7 +1965,7 @@ SCRAM-SHA-256$<iteration count>:&l - relfilenode oid + relfilenode int8 Name of the on-disk file of this relation; zero means this diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index a06fd3e..e222265 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -62,7 +62,7 @@ - relfilenode oid + relfilenode int8 (references pg_class.relfilenode) diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 41b9211..b75ad79 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -100,7 +100,7 @@ ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rda BlockNumber blknum; BufferGetTag(buffer, &locator, &forknum, &blknum); - elog(ERROR, "failed to add item to index page in %u/%u/%u", + elog(ERROR, "failed to add item to index page in %u/%u/" INT64_FORMAT, locator.spcOid, locator.dbOid, locator.relNumber); } } diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c index 7dd3c1d..c699937 100644 --- a/src/backend/access/rmgrdesc/gistdesc.c +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -26,7 +26,7 @@ out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec) static void out_gistxlogPageReuse(StringInfo buf, 
gistxlogPageReuse *xlrec) { - appendStringInfo(buf, "rel %u/%u/%u; blk %u; latestRemovedXid %u:%u", + appendStringInfo(buf, "rel %u/%u/" INT64_FORMAT "; blk %u; latestRemovedXid %u:%u", xlrec->locator.spcOid, xlrec->locator.dbOid, xlrec->locator.relNumber, xlrec->block, EpochFromFullTransactionId(xlrec->latestRemovedFullXid), diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index 923d3bc..f6d278b 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -169,7 +169,7 @@ heap2_desc(StringInfo buf, XLogReaderState *record) { xl_heap_new_cid *xlrec = (xl_heap_new_cid *) rec; - appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u", + appendStringInfo(buf, "rel %u/%u/" INT64_FORMAT "; tid %u/%u", xlrec->target_locator.spcOid, xlrec->target_locator.dbOid, xlrec->target_locator.relNumber, diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 4843cd5..70feb2d 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -100,7 +100,7 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec; - appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u", + appendStringInfo(buf, "rel %u/%u/" INT64_FORMAT "; latestRemovedXid %u:%u", xlrec->locator.spcOid, xlrec->locator.dbOid, xlrec->locator.relNumber, EpochFromFullTransactionId(xlrec->latestRemovedFullXid), diff --git a/src/backend/access/rmgrdesc/seqdesc.c b/src/backend/access/rmgrdesc/seqdesc.c index b3845f9..45c6ee7 100644 --- a/src/backend/access/rmgrdesc/seqdesc.c +++ b/src/backend/access/rmgrdesc/seqdesc.c @@ -25,7 +25,7 @@ seq_desc(StringInfo buf, XLogReaderState *record) xl_seq_rec *xlrec = (xl_seq_rec *) rec; if (info == XLOG_SEQ_LOG) - appendStringInfo(buf, "rel %u/%u/%u", + appendStringInfo(buf, "rel %u/%u/" INT64_FORMAT, xlrec->locator.spcOid, xlrec->locator.dbOid, xlrec->locator.relNumber); } diff 
--git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 6fec485..e21559d 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -45,8 +45,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) CheckPoint *checkpoint = (CheckPoint *) rec; appendStringInfo(buf, "redo %X/%X; " - "tli %u; prev tli %u; fpw %s; xid %u:%u; oid %u; multi %u; offset %u; " - "oldest xid %u in DB %u; oldest multi %u in DB %u; " + "tli %u; prev tli %u; fpw %s; xid %u:%u; relfilenumber " INT64_FORMAT ";oid %u; " + "multi %u; offset %u; oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest/newest commit timestamp xid: %u/%u; " "oldest running xid %u; %s", LSN_FORMAT_ARGS(checkpoint->redo), @@ -55,6 +55,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) checkpoint->fullPageWrites ? "true" : "false", EpochFromFullTransactionId(checkpoint->nextXid), XidFromFullTransactionId(checkpoint->nextXid), + checkpoint->nextRelFileNumber, checkpoint->nextOid, checkpoint->nextMulti, checkpoint->nextMultiOffset, @@ -74,6 +75,13 @@ xlog_desc(StringInfo buf, XLogReaderState *record) memcpy(&nextOid, rec, sizeof(Oid)); appendStringInfo(buf, "%u", nextOid); } + else if (info == XLOG_NEXT_RELFILENUMBER) + { + RelFileNumber nextRelFileNumber; + + memcpy(&nextRelFileNumber, rec, sizeof(RelFileNumber)); + appendStringInfo(buf, INT64_FORMAT, nextRelFileNumber); + } else if (info == XLOG_RESTORE_POINT) { xl_restore_point *xlrec = (xl_restore_point *) rec; @@ -169,6 +177,9 @@ xlog_identify(uint8 info) case XLOG_NEXTOID: id = "NEXTOID"; break; + case XLOG_NEXT_RELFILENUMBER: + id = "NEXT_RELFILENUMBER"; + break; case XLOG_SWITCH: id = "SWITCH"; break; @@ -237,7 +248,7 @@ XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty, appendStringInfoChar(buf, ' '); appendStringInfo(buf, - "blkref #%d: rel %u/%u/%u fork %s blk %u", + "blkref #%d: rel %u/%u/" INT64_FORMAT " fork %s blk %u", block_id, rlocator.spcOid, rlocator.dbOid, 
rlocator.relNumber, forkNames[forknum], @@ -297,7 +308,7 @@ XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty, if (forknum != MAIN_FORKNUM) { appendStringInfo(buf, - ", blkref #%d: rel %u/%u/%u fork %s blk %u", + ", blkref #%d: rel %u/%u/" INT64_FORMAT " fork %s blk %u", block_id, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, forkNames[forknum], @@ -306,7 +317,7 @@ XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty, else { appendStringInfo(buf, - ", blkref #%d: rel %u/%u/%u blk %u", + ", blkref #%d: rel %u/%u/" INT64_FORMAT " blk %u", block_id, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blk); diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 565f994..c72f4fb 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -692,8 +692,8 @@ by having database restart search for files that don't have any committed entry in pg_class, but that currently isn't done because of the possibility of deleting data that is useful for forensic analysis of the crash. Orphan files are harmless --- at worst they waste a bit of disk space --- -because we check for on-disk collisions when allocating new relfilenumber -OIDs. So cleaning up isn't really necessary. +because relfilenumber is 56 bits wide, so in practice there should not be any +collisions. So cleaning up isn't really necessary. 3. Deleting a table, which requires an unlink() that could fail. 
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 849a7ce..430e294 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -30,6 +30,9 @@ /* Number of OIDs to prefetch (preallocate) per XLOG write */ #define VAR_OID_PREFETCH 8192 +/* Number of RelFileNumbers to prefetch (preallocate) per XLOG write */ +#define VAR_RFN_PREFETCH 64 + /* pointer to "variable cache" in shared memory (set up by shmem.c) */ VariableCache ShmemVariableCache = NULL; @@ -521,8 +524,7 @@ ForceTransactionIdLimitUpdate(void) * wide, counter wraparound will occur eventually, and therefore it is unwise * to assume they are unique unless precautions are taken to make them so. * Hence, this routine should generally not be used directly. The only direct - * callers should be GetNewOidWithIndex() and GetNewRelFileNumber() in - * catalog/catalog.c. + * caller should be GetNewOidWithIndex() in catalog/catalog.c. */ Oid GetNewObjectId(void) @@ -613,6 +615,94 @@ SetNextObjectId(Oid nextOid) } /* + * GetNewRelFileNumber + * + * Similar to GetNewObjectId, but instead of a new OID it generates a new + * relfilenumber. 
+ */ +RelFileNumber +GetNewRelFileNumber(void) +{ + RelFileNumber result; + + /* Safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot assign RelFileNumber during recovery"); + + LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE); + + /* Check for wraparound of the relfilenumber counter */ + if (unlikely (ShmemVariableCache->nextRelFileNumber > MAX_RELFILENUMBER)) + elog(ERROR, "relfilenumber is out of bound"); + + /* If we have run out of WAL-logged RelFileNumbers, we must log more */ + if (ShmemVariableCache->relnumbercount == 0) + { + XLogPutNextRelFileNumber(ShmemVariableCache->nextRelFileNumber + + VAR_RFN_PREFETCH); + + ShmemVariableCache->relnumbercount = VAR_RFN_PREFETCH; + } + + result = ShmemVariableCache->nextRelFileNumber; + (ShmemVariableCache->nextRelFileNumber)++; + (ShmemVariableCache->relnumbercount)--; + + LWLockRelease(RelFileNumberGenLock); + + return result; +} + +/* + * SetNextRelFileNumber + * + * This may only be called during pg_upgrade; it advances the RelFileNumber + * counter to the specified value if the current value is smaller than the + * input value. + */ +void +SetNextRelFileNumber(RelFileNumber relnumber) +{ + int relnumbercount; + + /* Safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot forward RelFileNumber during recovery"); + + LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE); + + /* + * If the previously assigned value of nextRelFileNumber is already at or + * above the requested value, there is nothing to be done. This is possible + * because during upgrade the relfilenodes of the objects can arrive in any + * order. + */ + if (relnumber <= ShmemVariableCache->nextRelFileNumber) + { + LWLockRelease(RelFileNumberGenLock); + return; + } + + /* + * Check whether advancing to the new relfilenumber would exhaust the + * already-logged relnumbers; if so, we need to WAL-log again. Otherwise, + * just adjust the relnumbercount. 
+ */ + relnumbercount = relnumber - ShmemVariableCache->nextRelFileNumber; + if (ShmemVariableCache->relnumbercount <= relnumbercount) + { + XLogPutNextRelFileNumber(relnumber + VAR_RFN_PREFETCH); + ShmemVariableCache->relnumbercount = VAR_RFN_PREFETCH; + } + else + ShmemVariableCache->relnumbercount -= relnumbercount; + + ShmemVariableCache->nextRelFileNumber = relnumber; + + LWLockRelease(RelFileNumberGenLock); +} + +/* * StopGeneratingPinnedObjectIds * * This is called once during initdb to force the OID counter up to diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8764084..302da4a 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4546,6 +4546,7 @@ BootStrapXLOG(void) checkPoint.nextXid = FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); checkPoint.nextOid = FirstGenbkiObjectId; + checkPoint.nextRelFileNumber = FirstNormalRelFileNumber; checkPoint.nextMulti = FirstMultiXactId; checkPoint.nextMultiOffset = 0; checkPoint.oldestXid = FirstNormalTransactionId; @@ -4559,7 +4560,9 @@ BootStrapXLOG(void) ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber; ShmemVariableCache->oidCount = 0; + ShmemVariableCache->relnumbercount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -5026,7 +5029,9 @@ StartupXLOG(void) /* initialize shared memory variables from the checkpoint record */ ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber; ShmemVariableCache->oidCount = 0; + ShmemVariableCache->relnumbercount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); 
AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -6475,6 +6480,12 @@ CreateCheckPoint(int flags) checkPoint.nextOid += ShmemVariableCache->oidCount; LWLockRelease(OidGenLock); + LWLockAcquire(RelFileNumberGenLock, LW_SHARED); + checkPoint.nextRelFileNumber = ShmemVariableCache->nextRelFileNumber; + if (!shutdown) + checkPoint.nextRelFileNumber += ShmemVariableCache->relnumbercount; + LWLockRelease(RelFileNumberGenLock); + MultiXactGetCheckptMulti(shutdown, &checkPoint.nextMulti, &checkPoint.nextMultiOffset, @@ -7353,6 +7364,29 @@ XLogPutNextOid(Oid nextOid) } /* + * Similar to the XLogPutNextOid but instead of writing NEXTOID log record it + * writes a NEXT_RELFILENUMBER log record. + */ +void +XLogPutNextRelFileNumber(RelFileNumber nextrelnumber) +{ + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterData((char *) (&nextrelnumber), sizeof(RelFileNumber)); + recptr = XLogInsert(RM_XLOG_ID, XLOG_NEXT_RELFILENUMBER); + + /* + * Flush xlog record to disk before returning, to protect against file + * system changes reaching the disk before the XLOG_NEXT_RELFILENUMBER record. + * + * This should not impact performance much because we WAL-log the + * RelFileNumber only once every VAR_RFN_PREFETCH (64) assignments. + */ + XLogFlush(recptr); +} + +/* * Write an XLOG SWITCH record. * * Here we just blindly issue an XLogInsert request for the record. 
@@ -7567,6 +7601,16 @@ xlog_redo(XLogReaderState *record) ShmemVariableCache->oidCount = 0; LWLockRelease(OidGenLock); } + if (info == XLOG_NEXT_RELFILENUMBER) + { + RelFileNumber nextRelFileNumber; + + memcpy(&nextRelFileNumber, XLogRecGetData(record), sizeof(RelFileNumber)); + LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextRelFileNumber = nextRelFileNumber; + ShmemVariableCache->relnumbercount = 0; + LWLockRelease(RelFileNumberGenLock); + } else if (info == XLOG_CHECKPOINT_SHUTDOWN) { CheckPoint checkPoint; @@ -7581,6 +7625,10 @@ xlog_redo(XLogReaderState *record) ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; LWLockRelease(OidGenLock); + LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber; + ShmemVariableCache->relnumbercount = 0; + LWLockRelease(RelFileNumberGenLock); MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index d1662f3..a2c57d0 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -572,9 +572,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) /* * Don't try to prefetch anything in this database until - * it has been created, or we might confuse the blocks of - * different generations, if a database OID or - * relfilenumber is reused. It's also more efficient than + * it has been created, because it's more efficient than * discovering that relations don't exist on disk yet with * ENOENT errors. 
*/ @@ -610,7 +608,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u until %X/%X is replayed, which creates the relation", + "suppressing prefetch in relation %u/%u/" INT64_FORMAT " until %X/%X is replayed, which creates the relation", xlrec->rlocator.spcOid, xlrec->rlocator.dbOid, xlrec->rlocator.relNumber, @@ -633,7 +631,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, which truncates the relation", + "suppressing prefetch in relation %u/%u/" INT64_FORMAT " from block %u until %X/%X is replayed, which truncates the relation", xlrec->rlocator.spcOid, xlrec->rlocator.dbOid, xlrec->rlocator.relNumber, @@ -732,7 +730,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk", + "suppressing all prefetch in relation %u/%u/" INT64_FORMAT " until %X/%X is replayed, because the relation does not exist on disk", reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, reln->smgr_rlocator.locator.relNumber, @@ -753,7 +751,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small", + "suppressing prefetch in relation %u/%u/" INT64_FORMAT " from block %u until %X/%X is replayed, because the relation is too small", reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, reln->smgr_rlocator.locator.relNumber, @@ -792,7 +790,7 @@ 
XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) * truncated beneath our feet? */ elog(ERROR, - "could not prefetch relation %u/%u/%u block %u", + "could not prefetch relation %u/%u/" INT64_FORMAT " block %u", reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, reln->smgr_rlocator.locator.relNumber, @@ -930,7 +928,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator, { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)", + "prefetch of %u/%u/" INT64_FORMAT " block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)", rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno, LSN_FORMAT_ARGS(filter->filter_until_replayed), filter->filter_from_block); @@ -946,7 +944,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator, { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (whole database)", + "prefetch of %u/%u/" INT64_FORMAT " block %u suppressed; filtering until LSN %X/%X is replayed (whole database)", rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno, LSN_FORMAT_ARGS(filter->filter_until_replayed)); #endif diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 5d6f1b5..1d95fc0 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -2175,14 +2175,14 @@ xlog_block_info(StringInfo buf, XLogReaderState *record) continue; if (forknum != MAIN_FORKNUM) - appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u", + appendStringInfo(buf, "; blkref #%d: rel %u/%u/" INT64_FORMAT ", fork %u, blk %u", block_id, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, forknum, blk); else - appendStringInfo(buf, "; blkref #%d: 
rel %u/%u/%u, blk %u", + appendStringInfo(buf, "; blkref #%d: rel %u/%u/" INT64_FORMAT ", blk %u", block_id, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, @@ -2378,7 +2378,7 @@ verifyBackupPageConsistency(XLogReaderState *record) if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0) { elog(FATAL, - "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", + "inconsistent page found, rel %u/%u/" INT64_FORMAT ", forknum %u, blkno %u", rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, forknum, blkno); } diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 42a0f51..2f58e77 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -617,17 +617,17 @@ CreateFakeRelcacheEntry(RelFileLocator rlocator) rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT; /* We don't know the name of the relation; use relfilelocator instead */ - sprintf(RelationGetRelationName(rel), "%u", rlocator.relNumber); + sprintf(RelationGetRelationName(rel), INT64_FORMAT, rlocator.relNumber); /* * We set up the lockRelId in case anything tries to lock the dummy - * relation. Note that this is fairly bogus since relNumber may be - * different from the relation's OID. It shouldn't really matter though. + * relation. Note we are setting relId to just FirstNormalObjectId which + * is completely bogus. It shouldn't really matter though. * In recovery, we are running by ourselves and can't have any lock * conflicts. While syncing, we already hold AccessExclusiveLock. 
*/ rel->rd_lockInfo.lockRelId.dbId = rlocator.dbOid; - rel->rd_lockInfo.lockRelId.relId = rlocator.relNumber; + rel->rd_lockInfo.lockRelId.relId = FirstNormalObjectId; rel->rd_smgr = NULL; diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 2a33273..155400c 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -481,99 +481,6 @@ GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn) } /* - * GetNewRelFileNumber - * Generate a new relfilenumber that is unique within the - * database of the given tablespace. - * - * If the relfilenumber will also be used as the relation's OID, pass the - * opened pg_class catalog, and this routine will guarantee that the result - * is also an unused OID within pg_class. If the result is to be used only - * as a relfilenumber for an existing relation, pass NULL for pg_class. - * - * As with GetNewOidWithIndex(), there is some theoretical risk of a race - * condition, but it doesn't seem worth worrying about. - * - * Note: we don't support using this in bootstrap mode. All relations - * created by bootstrap have preassigned OIDs, so there's no need. - */ -RelFileNumber -GetNewRelFileNumber(Oid reltablespace, Relation pg_class, char relpersistence) -{ - RelFileLocatorBackend rlocator; - char *rpath; - bool collides; - BackendId backend; - - /* - * If we ever get here during pg_upgrade, there's something wrong; all - * relfilenumber assignments during a binary-upgrade run should be - * determined by commands in the dump script. 
- */ - Assert(!IsBinaryUpgrade); - - switch (relpersistence) - { - case RELPERSISTENCE_TEMP: - backend = BackendIdForTempRelations(); - break; - case RELPERSISTENCE_UNLOGGED: - case RELPERSISTENCE_PERMANENT: - backend = InvalidBackendId; - break; - default: - elog(ERROR, "invalid relpersistence: %c", relpersistence); - return InvalidOid; /* placate compiler */ - } - - /* This logic should match RelationInitPhysicalAddr */ - rlocator.locator.spcOid = reltablespace ? reltablespace : MyDatabaseTableSpace; - rlocator.locator.dbOid = (rlocator.locator.spcOid == GLOBALTABLESPACE_OID) ? InvalidOid : MyDatabaseId; - - /* - * The relpath will vary based on the backend ID, so we must initialize - * that properly here to make sure that any collisions based on filename - * are properly detected. - */ - rlocator.backend = backend; - - do - { - CHECK_FOR_INTERRUPTS(); - - /* Generate the OID */ - if (pg_class) - rlocator.locator.relNumber = GetNewOidWithIndex(pg_class, ClassOidIndexId, - Anum_pg_class_oid); - else - rlocator.locator.relNumber = GetNewObjectId(); - - /* Check for existing file of same name */ - rpath = relpath(rlocator, MAIN_FORKNUM); - - if (access(rpath, F_OK) == 0) - { - /* definite collision */ - collides = true; - } - else - { - /* - * Here we have a little bit of a dilemma: if errno is something - * other than ENOENT, should we declare a collision and loop? In - * practice it seems best to go ahead regardless of the errno. If - * there is a colliding file we will get an smgr failure when we - * attempt to create the new relation file. - */ - collides = false; - } - - pfree(rpath); - } while (collides); - - return rlocator.locator.relNumber; -} - -/* * SQL callable interface for GetNewOidWithIndex(). Outside of initdb's * direct insertions into catalog tables, and recovering from corruption, this * should rarely be needed. 
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index c69c923..02ed007 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -347,7 +347,7 @@ heap_create(const char *relname, * with oid same as relid. */ if (!RelFileNumberIsValid(relfilenumber)) - relfilenumber = relid; + relfilenumber = GetNewRelFileNumber(); } /* @@ -900,7 +900,7 @@ InsertPgClassTuple(Relation pg_class_desc, values[Anum_pg_class_reloftype - 1] = ObjectIdGetDatum(rd_rel->reloftype); values[Anum_pg_class_relowner - 1] = ObjectIdGetDatum(rd_rel->relowner); values[Anum_pg_class_relam - 1] = ObjectIdGetDatum(rd_rel->relam); - values[Anum_pg_class_relfilenode - 1] = ObjectIdGetDatum(rd_rel->relfilenode); + values[Anum_pg_class_relfilenode - 1] = Int64GetDatum(rd_rel->relfilenode); values[Anum_pg_class_reltablespace - 1] = ObjectIdGetDatum(rd_rel->reltablespace); values[Anum_pg_class_relpages - 1] = Int32GetDatum(rd_rel->relpages); values[Anum_pg_class_reltuples - 1] = Float4GetDatum(rd_rel->reltuples); @@ -1231,8 +1231,8 @@ heap_create_with_catalog(const char *relname, } if (!OidIsValid(relid)) - relid = GetNewRelFileNumber(reltablespace, pg_class_desc, - relpersistence); + relid = GetNewOidWithIndex(pg_class_desc, ClassOidIndexId, + Anum_pg_class_oid); } /* diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index f245df8..46b914b 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -937,8 +937,8 @@ index_create(Relation heapRelation, } else { - indexRelationId = - GetNewRelFileNumber(tableSpaceId, pg_class, relpersistence); + indexRelationId = GetNewOidWithIndex(pg_class, ClassOidIndexId, + Anum_pg_class_oid); } } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index bf645b8..9270aac 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -14371,11 +14371,13 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) } /* - * 
Relfilenumbers are not unique in databases across tablespaces, so we need - * to allocate a new one in the new tablespace. + * Generate a new relfilenumber. Although relfilenumbers are unique within a + * cluster, we cannot reuse the old relfilenumber, because files of unused + * relfilenumbers are not unlinked until commit. So if, within a single + * transaction, the relation were moved back to the old tablespace, we would + * find a conflicting relfilenumber file there. */ - newrelfilenumber = GetNewRelFileNumber(newTableSpace, NULL, - rel->rd_rel->relpersistence); + newrelfilenumber = GetNewRelFileNumber(); /* Open old and new relation */ newrlocator = rel->rd_locator; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 3724d48..3f2618a 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2928,7 +2928,7 @@ _outIndexStmt(StringInfo str, const IndexStmt *node) WRITE_NODE_FIELD(excludeOpNames); WRITE_STRING_FIELD(idxcomment); WRITE_OID_FIELD(indexOid); - WRITE_OID_FIELD(oldNumber); + WRITE_UINT64_FIELD(oldNumber); WRITE_UINT_FIELD(oldCreateSubid); WRITE_UINT_FIELD(oldFirstRelfilenumberSubid); WRITE_BOOL_FIELD(unique); diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index c5c6a2b..7029604 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -154,6 +154,7 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) break; case XLOG_NOOP: case XLOG_NEXTOID: + case XLOG_NEXT_RELFILENUMBER: case XLOG_SWITCH: case XLOG_BACKUP_END: case XLOG_PARAMETER_CHANGE: diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index f8fb228..4366ae6 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -4869,7 +4869,7 @@ DisplayMapping(HTAB *tuplecid_data) hash_seq_init(&hstat, tuplecid_data); while ((ent = (ReorderBufferTupleCidEnt *)
hash_seq_search(&hstat)) != NULL) { - elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u", + elog(DEBUG3, "mapping: node: %u/%u/" INT64_FORMAT " tid: %u/%u cmin: %u, cmax: %u", ent->key.rlocator.dbOid, ent->key.rlocator.spcOid, ent->key.rlocator.relNumber, diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c index af4dab7..172225b 100644 --- a/src/backend/storage/freespace/fsmpage.c +++ b/src/backend/storage/freespace/fsmpage.c @@ -273,7 +273,7 @@ restart: BlockNumber blknum; BufferGetTag(buf, &rlocator, &forknum, &blknum); - elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u", + elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/" INT64_FORMAT, blknum, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber); /* make sure we hold an exclusive lock */ diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 6c7cf6c..b64dbe7 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +RelFileNumberGenLock 48 \ No newline at end of file diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index b21d8c3..5f6c12a 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -154,7 +154,7 @@ smgropen(RelFileLocator rlocator, BackendId backend) /* First time through: initialize the hash table */ HASHCTL ctl; - ctl.keysize = sizeof(RelFileLocatorBackend); + ctl.keysize = SizeOfRelFileLocatorBackend; ctl.entrysize = sizeof(SMgrRelationData); SMgrRelationHash = hash_create("smgr relation table", 400, &ctl, HASH_ELEM | HASH_BLOBS); diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index d8ae082..5bbd847 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -878,7 
+878,7 @@ pg_relation_filenode(PG_FUNCTION_ARGS) if (!RelFileNumberIsValid(result)) PG_RETURN_NULL(); - PG_RETURN_OID(result); + PG_RETURN_INT64(result); } /* @@ -898,7 +898,7 @@ Datum pg_filenode_relation(PG_FUNCTION_ARGS) { Oid reltablespace = PG_GETARG_OID(0); - RelFileNumber relfilenumber = PG_GETARG_OID(1); + RelFileNumber relfilenumber = PG_GETARG_INT64(1); Oid heaprel; /* test needed so RelidByRelfilenumber doesn't misbehave */ diff --git a/src/backend/utils/adt/pg_upgrade_support.c b/src/backend/utils/adt/pg_upgrade_support.c index 4408c00..f5b6d41 100644 --- a/src/backend/utils/adt/pg_upgrade_support.c +++ b/src/backend/utils/adt/pg_upgrade_support.c @@ -98,10 +98,11 @@ binary_upgrade_set_next_heap_pg_class_oid(PG_FUNCTION_ARGS) Datum binary_upgrade_set_next_heap_relfilenode(PG_FUNCTION_ARGS) { - RelFileNumber relfilenumber = PG_GETARG_OID(0); + RelFileNumber relfilenumber = PG_GETARG_INT64(0); CHECK_IS_BINARY_UPGRADE; binary_upgrade_next_heap_pg_class_relfilenumber = relfilenumber; + SetNextRelFileNumber(relfilenumber + 1); PG_RETURN_VOID(); } @@ -120,10 +121,11 @@ binary_upgrade_set_next_index_pg_class_oid(PG_FUNCTION_ARGS) Datum binary_upgrade_set_next_index_relfilenode(PG_FUNCTION_ARGS) { - RelFileNumber relfilenumber = PG_GETARG_OID(0); + RelFileNumber relfilenumber = PG_GETARG_INT64(0); CHECK_IS_BINARY_UPGRADE; binary_upgrade_next_index_pg_class_relfilenumber = relfilenumber; + SetNextRelFileNumber(relfilenumber + 1); PG_RETURN_VOID(); } @@ -142,10 +144,11 @@ binary_upgrade_set_next_toast_pg_class_oid(PG_FUNCTION_ARGS) Datum binary_upgrade_set_next_toast_relfilenode(PG_FUNCTION_ARGS) { - RelFileNumber relfilenumber = PG_GETARG_OID(0); + RelFileNumber relfilenumber = PG_GETARG_INT64(0); CHECK_IS_BINARY_UPGRADE; binary_upgrade_next_toast_pg_class_relfilenumber = relfilenumber; + SetNextRelFileNumber(relfilenumber + 1); PG_RETURN_VOID(); } diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index b80e2ec3..57d34cb 100644 
--- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -3630,7 +3630,7 @@ RelationBuildLocalRelation(const char *relname, if (mapped_relation) { - rel->rd_rel->relfilenode = InvalidOid; + rel->rd_rel->relfilenode = InvalidRelFileNumber; /* Add it to the active mapping information */ RelationMapUpdateMap(relid, relfilenumber, shared_relation, true); } @@ -3708,8 +3708,7 @@ RelationSetNewRelfilenumber(Relation relation, char persistence) RelFileLocator newrlocator; /* Allocate a new relfilenumber */ - newrelfilenumber = GetNewRelFileNumber(relation->rd_rel->reltablespace, - NULL, persistence); + newrelfilenumber = GetNewRelFileNumber(); /* * Get a writable copy of the pg_class tuple for the given relation. diff --git a/src/backend/utils/cache/relfilenumbermap.c b/src/backend/utils/cache/relfilenumbermap.c index 3dc45e9..a5ec78c 100644 --- a/src/backend/utils/cache/relfilenumbermap.c +++ b/src/backend/utils/cache/relfilenumbermap.c @@ -196,7 +196,7 @@ RelidByRelfilenumber(Oid reltablespace, RelFileNumber relfilenumber) /* set scan arguments */ skey[0].sk_argument = ObjectIdGetDatum(reltablespace); - skey[1].sk_argument = ObjectIdGetDatum(relfilenumber); + skey[1].sk_argument = Int64GetDatum(relfilenumber); scandesc = systable_beginscan(relation, ClassTblspcRelfilenodeIndexId, @@ -213,7 +213,7 @@ RelidByRelfilenumber(Oid reltablespace, RelFileNumber relfilenumber) if (found) elog(ERROR, - "unexpected duplicate for tablespace %u, relfilenumber %u", + "unexpected duplicate for tablespace %u, relfilenumber " INT64_FORMAT, reltablespace, relfilenumber); found = true; diff --git a/src/backend/utils/misc/pg_controldata.c b/src/backend/utils/misc/pg_controldata.c index 781f8b8..3c1fef4 100644 --- a/src/backend/utils/misc/pg_controldata.c +++ b/src/backend/utils/misc/pg_controldata.c @@ -79,8 +79,8 @@ pg_control_system(PG_FUNCTION_ARGS) Datum pg_control_checkpoint(PG_FUNCTION_ARGS) { - Datum values[18]; - bool nulls[18]; + Datum values[19]; + bool 
nulls[19]; TupleDesc tupdesc; HeapTuple htup; ControlFileData *ControlFile; @@ -129,6 +129,8 @@ pg_control_checkpoint(PG_FUNCTION_ARGS) XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 18, "checkpoint_time", TIMESTAMPTZOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 19, "next_relfilenumber", + INT8OID, -1, 0); tupdesc = BlessTupleDesc(tupdesc); /* Read the control file. */ @@ -202,6 +204,9 @@ pg_control_checkpoint(PG_FUNCTION_ARGS) values[17] = TimestampTzGetDatum(time_t_to_timestamptz(ControlFile->checkPointCopy.time)); nulls[17] = false; + values[18] = Int64GetDatum(ControlFile->checkPointCopy.nextRelFileNumber); + nulls[18] = false; + htup = heap_form_tuple(tupdesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(htup)); diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 21dfe1b..65fc623 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -489,9 +489,9 @@ main(int argc, char *argv[]) mode = PG_MODE_ENABLE; break; case 'f': - if (!option_parse_int(optarg, "-f/--filenode", 0, - INT_MAX, - NULL)) + if (!option_parse_int64(optarg, "-f/--filenode", 0, + LLONG_MAX, + NULL)) exit(1); only_filenode = pstrdup(optarg); break; diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index c390ec5..f727078 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -250,6 +250,8 @@ main(int argc, char *argv[]) printf(_("Latest checkpoint's NextXID: %u:%u\n"), EpochFromFullTransactionId(ControlFile->checkPointCopy.nextXid), XidFromFullTransactionId(ControlFile->checkPointCopy.nextXid)); + printf(_("Latest checkpoint's NextRelFileNumber: " INT64_FORMAT "\n"), + ControlFile->checkPointCopy.nextRelFileNumber); printf(_("Latest checkpoint's NextOID: %u\n"), ControlFile->checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), diff --git a/src/bin/pg_dump/pg_dump.c 
b/src/bin/pg_dump/pg_dump.c index 30b2f85..2d70833 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -4837,16 +4837,16 @@ binary_upgrade_set_pg_class_oids(Archive *fout, relkind = *PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "relkind")); - relfilenumber = atooid(PQgetvalue(upgrade_res, 0, - PQfnumber(upgrade_res, "relfilenode"))); + relfilenumber = atorelnumber(PQgetvalue(upgrade_res, 0, + PQfnumber(upgrade_res, "relfilenode"))); toast_oid = atooid(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "reltoastrelid"))); - toast_relfilenumber = atooid(PQgetvalue(upgrade_res, 0, - PQfnumber(upgrade_res, "toast_relfilenode"))); + toast_relfilenumber = atorelnumber(PQgetvalue(upgrade_res, 0, + PQfnumber(upgrade_res, "toast_relfilenode"))); toast_index_oid = atooid(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "indexrelid"))); - toast_index_relfilenumber = atooid(PQgetvalue(upgrade_res, 0, - PQfnumber(upgrade_res, "toast_index_relfilenode"))); + toast_index_relfilenumber = atorelnumber(PQgetvalue(upgrade_res, 0, + PQfnumber(upgrade_res, "toast_index_relfilenode"))); appendPQExpBufferStr(upgrade_buffer, "\n-- For binary upgrade, must preserve pg_class oids and relfilenodes\n"); @@ -4864,7 +4864,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout, */ if (RelFileNumberIsValid(relfilenumber) && relkind != RELKIND_PARTITIONED_TABLE) appendPQExpBuffer(upgrade_buffer, - "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('" INT64_FORMAT "'::pg_catalog.int8);\n", relfilenumber); /* @@ -4878,7 +4878,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout, "SELECT pg_catalog.binary_upgrade_set_next_toast_pg_class_oid('%u'::pg_catalog.oid);\n", toast_oid); appendPQExpBuffer(upgrade_buffer, - "SELECT pg_catalog.binary_upgrade_set_next_toast_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT pg_catalog.binary_upgrade_set_next_toast_relfilenode('" INT64_FORMAT 
"'::pg_catalog.int8);\n", toast_relfilenumber); /* every toast table has an index */ @@ -4886,7 +4886,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout, "SELECT pg_catalog.binary_upgrade_set_next_index_pg_class_oid('%u'::pg_catalog.oid);\n", toast_index_oid); appendPQExpBuffer(upgrade_buffer, - "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('" INT64_FORMAT "'::pg_catalog.int8);\n", toast_index_relfilenumber); } @@ -4899,7 +4899,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout, "SELECT pg_catalog.binary_upgrade_set_next_index_pg_class_oid('%u'::pg_catalog.oid);\n", pg_class_oid); appendPQExpBuffer(upgrade_buffer, - "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('" INT64_FORMAT "'::pg_catalog.int8);\n", relfilenumber); } diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c index 269ed64..8be5e66 100644 --- a/src/bin/pg_rewind/filemap.c +++ b/src/bin/pg_rewind/filemap.c @@ -538,7 +538,7 @@ isRelDataFile(const char *path) segNo = 0; matched = false; - nmatch = sscanf(path, "global/%u.%u", &rlocator.relNumber, &segNo); + nmatch = sscanf(path, "global/" INT64_FORMAT ".%u", &rlocator.relNumber, &segNo); if (nmatch == 1 || nmatch == 2) { rlocator.spcOid = GLOBALTABLESPACE_OID; @@ -547,7 +547,7 @@ isRelDataFile(const char *path) } else { - nmatch = sscanf(path, "base/%u/%u.%u", + nmatch = sscanf(path, "base/%u/" INT64_FORMAT ".%u", &rlocator.dbOid, &rlocator.relNumber, &segNo); if (nmatch == 2 || nmatch == 3) { @@ -556,7 +556,7 @@ isRelDataFile(const char *path) } else { - nmatch = sscanf(path, "pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/%u.%u", + nmatch = sscanf(path, "pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/" INT64_FORMAT ".%u", &rlocator.spcOid, &rlocator.dbOid, &rlocator.relNumber, &segNo); if (nmatch == 3 || nmatch == 4) diff 
--git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index 5d30b87..ea62e7d 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -399,11 +399,11 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) i_reloid, i_indtable, i_toastheap, - i_relfilenumber, i_reltablespace; - char query[QUERY_ALLOC]; - char *last_namespace = NULL, - *last_tablespace = NULL; + int i_relfilenumber; + char query[QUERY_ALLOC]; + char *last_namespace = NULL, + *last_tablespace = NULL; query[0] = '\0'; /* initialize query string to empty */ @@ -527,7 +527,8 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) relname = PQgetvalue(res, relnum, i_relname); curr->relname = pg_strdup(relname); - curr->relfilenumber = atooid(PQgetvalue(res, relnum, i_relfilenumber)); + curr->relfilenumber = + atorelnumber(PQgetvalue(res, relnum, i_relfilenumber)); curr->tblsp_alloc = false; /* Is the tablespace oid non-default? */ diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 265d829..4c4f03a 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -15,10 +15,8 @@ * oids are the same between old and new clusters. This is important * because toast oids are stored as toast pointers in user tables. * - * While pg_class.oid and pg_class.relfilenode are initially the same in a - * cluster, they can diverge due to CLUSTER, REINDEX, or VACUUM FULL. We - * control assignments of pg_class.relfilenode because we want the filenames - * to match between the old and new cluster. + * We control assignments of pg_class.relfilenode because we want the + * filenames to match between the old and new cluster. * * We control assignment of pg_tablespace.oid because we want the oid to match * between the old and new cluster.
diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c index b3ad820..50e94df 100644 --- a/src/bin/pg_upgrade/relfilenumber.c +++ b/src/bin/pg_upgrade/relfilenumber.c @@ -190,14 +190,14 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro else snprintf(extent_suffix, sizeof(extent_suffix), ".%d", segno); - snprintf(old_file, sizeof(old_file), "%s%s/%u/%u%s%s", + snprintf(old_file, sizeof(old_file), "%s%s/%u/" INT64_FORMAT "%s%s", map->old_tablespace, map->old_tablespace_suffix, map->db_oid, map->relfilenumber, type_suffix, extent_suffix); - snprintf(new_file, sizeof(new_file), "%s%s/%u/%u%s%s", + snprintf(new_file, sizeof(new_file), "%s%s/%u/" INT64_FORMAT "%s%s", map->new_tablespace, map->new_tablespace_suffix, map->db_oid, diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 0fdde9d..e5b0b50 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -884,7 +884,7 @@ main(int argc, char **argv) } break; case 'R': - if (sscanf(optarg, "%u/%u/%u", + if (sscanf(optarg, "%u/%u/" INT64_FORMAT, &config.filter_by_relation.spcOid, &config.filter_by_relation.dbOid, &config.filter_by_relation.relNumber) != 3 || diff --git a/src/common/relpath.c b/src/common/relpath.c index 1b6b620..0774e3f 100644 --- a/src/common/relpath.c +++ b/src/common/relpath.c @@ -149,10 +149,10 @@ GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber, Assert(dbOid == 0); Assert(backendId == InvalidBackendId); if (forkNumber != MAIN_FORKNUM) - path = psprintf("global/%u_%s", + path = psprintf("global/" INT64_FORMAT "_%s", relNumber, forkNames[forkNumber]); else - path = psprintf("global/%u", relNumber); + path = psprintf("global/" INT64_FORMAT, relNumber); } else if (spcOid == DEFAULTTABLESPACE_OID) { @@ -160,21 +160,21 @@ GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber, if (backendId == InvalidBackendId) { if (forkNumber != MAIN_FORKNUM) - path = 
psprintf("base/%u/%u_%s", + path = psprintf("base/%u/" INT64_FORMAT "_%s", dbOid, relNumber, forkNames[forkNumber]); else - path = psprintf("base/%u/%u", + path = psprintf("base/%u/" INT64_FORMAT, dbOid, relNumber); } else { if (forkNumber != MAIN_FORKNUM) - path = psprintf("base/%u/t%d_%u_%s", + path = psprintf("base/%u/t%d_" INT64_FORMAT "_%s", dbOid, backendId, relNumber, forkNames[forkNumber]); else - path = psprintf("base/%u/t%d_%u", + path = psprintf("base/%u/t%d_" INT64_FORMAT, dbOid, backendId, relNumber); } } @@ -184,24 +184,24 @@ GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber, if (backendId == InvalidBackendId) { if (forkNumber != MAIN_FORKNUM) - path = psprintf("pg_tblspc/%u/%s/%u/%u_%s", + path = psprintf("pg_tblspc/%u/%s/%u/" INT64_FORMAT "_%s", spcOid, TABLESPACE_VERSION_DIRECTORY, dbOid, relNumber, forkNames[forkNumber]); else - path = psprintf("pg_tblspc/%u/%s/%u/%u", + path = psprintf("pg_tblspc/%u/%s/%u/" INT64_FORMAT, spcOid, TABLESPACE_VERSION_DIRECTORY, dbOid, relNumber); } else { if (forkNumber != MAIN_FORKNUM) - path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u_%s", + path = psprintf("pg_tblspc/%u/%s/%u/t%d_" INT64_FORMAT "_%s", spcOid, TABLESPACE_VERSION_DIRECTORY, dbOid, backendId, relNumber, forkNames[forkNumber]); else - path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u", + path = psprintf("pg_tblspc/%u/%s/%u/t%d_" INT64_FORMAT, spcOid, TABLESPACE_VERSION_DIRECTORY, dbOid, backendId, relNumber); } diff --git a/src/fe_utils/option_utils.c b/src/fe_utils/option_utils.c index abea881..2cb3370 100644 --- a/src/fe_utils/option_utils.c +++ b/src/fe_utils/option_utils.c @@ -82,3 +82,45 @@ option_parse_int(const char *optarg, const char *optname, *result = val; return true; } + +/* + * option_parse_int64 + * + * Same as option_parse_int but parse int64. 
+ */ +bool +option_parse_int64(const char *optarg, const char *optname, + int64 min_range, int64 max_range, + int64 *result) +{ + char *endptr; + int64 val; + + errno = 0; + val = strtoi64(optarg, &endptr, 10); + + /* + * Skip any trailing whitespace; if anything but whitespace remains before + * the terminating character, fail. + */ + while (*endptr != '\0' && isspace((unsigned char) *endptr)) + endptr++; + + if (*endptr != '\0') + { + pg_log_error("invalid value \"%s\" for option %s", + optarg, optname); + return false; + } + + if (errno == ERANGE || val < min_range || val > max_range) + { + pg_log_error("%s must be in range " INT64_FORMAT ".." INT64_FORMAT, + optname, min_range, max_range); + return false; + } + + if (result) + *result = val; + return true; +} diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 775471d..37afdd1 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -213,6 +213,9 @@ typedef struct VariableCacheData */ Oid nextOid; /* next OID to assign */ uint32 oidCount; /* OIDs available before must do XLOG work */ + RelFileNumber nextRelFileNumber; /* next relfilenumber to assign */ + uint32 relnumbercount; /* relfilenumbers available before must do + XLOG work */ /* * These fields are protected by XidGenLock. 
@@ -293,6 +296,8 @@ extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid, extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid); extern bool ForceTransactionIdLimitUpdate(void); extern Oid GetNewObjectId(void); +extern RelFileNumber GetNewRelFileNumber(void); +extern void SetNextRelFileNumber(RelFileNumber relnumber); extern void StopGeneratingPinnedObjectIds(void); #ifdef USE_ASSERT_CHECKING diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index cd674c3..4cae54b 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -234,6 +234,7 @@ extern void CreateCheckPoint(int flags); extern bool CreateRestartPoint(int flags); extern WALAvailability GetWALAvailability(XLogRecPtr targetLSN); extern void XLogPutNextOid(Oid nextOid); +extern void XLogPutNextRelFileNumber(RelFileNumber nextrelnumber); extern XLogRecPtr XLogRestorePoint(const char *rpName); extern void UpdateFullPageWrites(void); extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p); diff --git a/src/include/catalog/catalog.h b/src/include/catalog/catalog.h index 66900f1..b452530 100644 --- a/src/include/catalog/catalog.h +++ b/src/include/catalog/catalog.h @@ -38,8 +38,5 @@ extern bool IsPinnedObject(Oid classId, Oid objectId); extern Oid GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn); -extern RelFileNumber GetNewRelFileNumber(Oid reltablespace, - Relation pg_class, - char relpersistence); #endif /* CATALOG_H */ diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h index e1f4eef..1cf039c 100644 --- a/src/include/catalog/pg_class.h +++ b/src/include/catalog/pg_class.h @@ -31,6 +31,10 @@ */ CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,RelationRelation_Rowtype_Id) BKI_SCHEMA_MACRO { + /* identifier of physical storage file */ + /* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */ + int64 relfilenode BKI_DEFAULT(0); + /* oid */ 
Oid oid; @@ -52,10 +56,6 @@ CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat /* access method; 0 if not a table / index */ Oid relam BKI_DEFAULT(heap) BKI_LOOKUP_OPT(pg_am); - /* identifier of physical storage file */ - /* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */ - Oid relfilenode BKI_DEFAULT(0); - /* identifier of table space for relation (0 means default for database) */ Oid reltablespace BKI_DEFAULT(0) BKI_LOOKUP_OPT(pg_tablespace); @@ -154,7 +154,7 @@ typedef FormData_pg_class *Form_pg_class; DECLARE_UNIQUE_INDEX_PKEY(pg_class_oid_index, 2662, ClassOidIndexId, on pg_class using btree(oid oid_ops)); DECLARE_UNIQUE_INDEX(pg_class_relname_nsp_index, 2663, ClassNameNspIndexId, on pg_class using btree(relname name_ops, relnamespace oid_ops)); -DECLARE_INDEX(pg_class_tblspc_relfilenode_index, 3455, ClassTblspcRelfilenodeIndexId, on pg_class using btree(reltablespace oid_ops, relfilenode oid_ops)); +DECLARE_INDEX(pg_class_tblspc_relfilenode_index, 3455, ClassTblspcRelfilenodeIndexId, on pg_class using btree(reltablespace oid_ops, relfilenode int8_ops)); #ifdef EXPOSE_TO_CLIENT_CODE diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 06368e2..d5e6172 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -41,6 +41,7 @@ typedef struct CheckPoint * timeline (equals ThisTimeLineID otherwise) */ bool fullPageWrites; /* current full_page_writes */ FullTransactionId nextXid; /* next free transaction ID */ + RelFileNumber nextRelFileNumber; /* next relfilenumber */ Oid nextOid; /* next free OID */ MultiXactId nextMulti; /* next free MultiXactId */ MultiXactOffset nextMultiOffset; /* next free MultiXact offset */ @@ -78,6 +79,7 @@ typedef struct CheckPoint #define XLOG_FPI 0xB0 /* 0xC0 is used in Postgres 9.5-11 */ #define XLOG_OVERWRITE_CONTRECORD 0xD0 +#define XLOG_NEXT_RELFILENUMBER 0xE0 /* diff --git a/src/include/catalog/pg_proc.dat 
b/src/include/catalog/pg_proc.dat index a77b293..68944fd 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -7321,11 +7321,11 @@ proname => 'pg_indexes_size', provolatile => 'v', prorettype => 'int8', proargtypes => 'regclass', prosrc => 'pg_indexes_size' }, { oid => '2999', descr => 'filenode identifier of relation', - proname => 'pg_relation_filenode', provolatile => 's', prorettype => 'oid', + proname => 'pg_relation_filenode', provolatile => 's', prorettype => 'int8', proargtypes => 'regclass', prosrc => 'pg_relation_filenode' }, { oid => '3454', descr => 'relation OID for filenode and tablespace', proname => 'pg_filenode_relation', provolatile => 's', - prorettype => 'regclass', proargtypes => 'oid oid', + prorettype => 'regclass', proargtypes => 'oid int8', prosrc => 'pg_filenode_relation' }, { oid => '3034', descr => 'file path of relation', proname => 'pg_relation_filepath', provolatile => 's', prorettype => 'text', @@ -11191,15 +11191,15 @@ prosrc => 'binary_upgrade_set_missing_value' }, { oid => '4545', descr => 'for use by pg_upgrade', proname => 'binary_upgrade_set_next_heap_relfilenode', provolatile => 'v', - proparallel => 'u', prorettype => 'void', proargtypes => 'oid', + proparallel => 'u', prorettype => 'void', proargtypes => 'int8', prosrc => 'binary_upgrade_set_next_heap_relfilenode' }, { oid => '4546', descr => 'for use by pg_upgrade', proname => 'binary_upgrade_set_next_index_relfilenode', provolatile => 'v', - proparallel => 'u', prorettype => 'void', proargtypes => 'oid', + proparallel => 'u', prorettype => 'void', proargtypes => 'int8', prosrc => 'binary_upgrade_set_next_index_relfilenode' }, { oid => '4547', descr => 'for use by pg_upgrade', proname => 'binary_upgrade_set_next_toast_relfilenode', provolatile => 'v', - proparallel => 'u', prorettype => 'void', proargtypes => 'oid', + proparallel => 'u', prorettype => 'void', proargtypes => 'int8', prosrc => 'binary_upgrade_set_next_toast_relfilenode' }, { 
oid => '4548', descr => 'for use by pg_upgrade', proname => 'binary_upgrade_set_next_pg_tablespace_oid', provolatile => 'v', diff --git a/src/include/fe_utils/option_utils.h b/src/include/fe_utils/option_utils.h index 03c09fd..8c0e818 100644 --- a/src/include/fe_utils/option_utils.h +++ b/src/include/fe_utils/option_utils.h @@ -22,5 +22,8 @@ extern void handle_help_version_opts(int argc, char *argv[], extern bool option_parse_int(const char *optarg, const char *optname, int min_range, int max_range, int *result); +extern bool option_parse_int64(const char *optarg, const char *optname, + int64 min_range, int64 max_range, + int64 *result); #endif /* OPTION_UTILS_H */ diff --git a/src/include/postgres_ext.h b/src/include/postgres_ext.h index d8af68b..ecdfc90 100644 --- a/src/include/postgres_ext.h +++ b/src/include/postgres_ext.h @@ -48,11 +48,14 @@ typedef PG_INT64_TYPE pg_int64; /* * RelFileNumber data type identifies the specific relation file name. + * RelFileNumber is unique within a cluster. */ -typedef Oid RelFileNumber; -#define InvalidRelFileNumber ((RelFileNumber) InvalidOid) +typedef pg_int64 RelFileNumber; +#define InvalidRelFileNumber ((RelFileNumber) 0) +#define FirstNormalRelFileNumber ((RelFileNumber) 100000) #define RelFileNumberIsValid(relnumber) \ ((bool) ((relnumber) != InvalidRelFileNumber)) +#define atorelnumber(x) ((RelFileNumber) strtoull((x), NULL, 10)) /* * Identifiers of error message fields. Kept here to keep common diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index b1b8061..bd74219 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -92,16 +92,19 @@ typedef struct buftag { Oid spcOid; /* tablespace oid. */ Oid dbOid; /* database oid. */ - RelFileNumber relNumber; /* relation file number.
*/ - ForkNumber forkNum; + uint32 relNumber_low; /* relfilenumber 32 lower bits */ + uint32 relNumber_hi:24; /* relfilenumber 24 high bits */ + uint32 forkNum:8; /* fork number */ BlockNumber blockNum; /* blknum relative to begin of reln */ } BufferTag; -#define BufTagGetFileNumber(a) ((a).relNumber) +#define BufTagGetFileNumber(a) \ + ((((uint64) (a).relNumber_hi << 32) | ((uint32) (a).relNumber_low))) #define BufTagSetFileNumber(a, relnumber) \ ( \ - (a).relNumber = (relnumber) \ + (a).relNumber_hi = (relnumber) >> 32, \ + (a).relNumber_low = (relnumber) & 0xffffffff \ ) #define CLEAR_BUFFERTAG(a) \ @@ -126,7 +129,8 @@ typedef struct buftag ( \ (a).spcOid == (b).spcOid && \ (a).dbOid == (b).dbOid && \ - (a).relNumber == (b).relNumber && \ + (a).relNumber_low == (b).relNumber_low && \ + (a).relNumber_hi == (b).relNumber_hi && \ (a).blockNum == (b).blockNum && \ (a).forkNum == (b).forkNum \ ) @@ -135,14 +139,14 @@ typedef struct buftag do { \ (locator).spcOid = (a).spcOid; \ (locator).dbOid = (a).dbOid; \ - (locator).relNumber = (a).relNumber; \ + (locator).relNumber = BufTagGetFileNumber(a); \ } while(0) #define BuffTagRelFileLocatorEquals(a, locator) \ ( \ (a).spcOid == (locator).spcOid && \ (a).dbOid == (locator).dbOid && \ - (a).relNumber == (locator).relNumber \ + BufTagGetFileNumber(a) == (locator).relNumber \ ) /* diff --git a/src/include/storage/relfilelocator.h b/src/include/storage/relfilelocator.h index 7211fe7..6046506 100644 --- a/src/include/storage/relfilelocator.h +++ b/src/include/storage/relfilelocator.h @@ -34,8 +34,7 @@ * relNumber identifies the specific relation. relNumber corresponds to * pg_class.relfilenode (NOT pg_class.oid, because we need to be able * to assign new physical files to relations in some situations). - * Notice that relNumber is only unique within a database in a particular - * tablespace. + * Notice that relNumber is unique within a cluster. * * Note: spcOid must be GLOBALTABLESPACE_OID if and only if dbOid is * zero. 
We support shared relations only in the "global" tablespace. @@ -75,6 +74,15 @@ typedef struct RelFileLocatorBackend BackendId backend; } RelFileLocatorBackend; +#define SizeOfRelFileLocatorBackend \ + (offsetof(RelFileLocatorBackend, backend) + sizeof(BackendId)) + +/* + * Max value of the relfilenumber. RelFileNumber is 56 bits wide; for more + * details, refer to the comments atop BufferTag. + */ +#define MAX_RELFILENUMBER INT64CONST(0x00FFFFFFFFFFFFFF) + #define RelFileLocatorBackendIsTemp(rlocator) \ ((rlocator).backend != InvalidBackendId) diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 5ede56d..6230fcb 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -2164,9 +2164,8 @@ select relname, c.oid = oldoid as orig_oid, case relfilenode when 0 then 'none' - when c.oid then 'own' when oldfilenode then 'orig' - else 'OTHER' + else 'new' end as storage, obj_description(c.oid, 'pg_class') as desc from pg_class c left join old_oids using (relname) @@ -2175,10 +2174,10 @@ select relname, relname | orig_oid | storage | desc ------------------------------+----------+---------+--------------- at_partitioned | t | none | - at_partitioned_0 | t | own | - at_partitioned_0_id_name_key | t | own | child 0 index - at_partitioned_1 | t | own | - at_partitioned_1_id_name_key | t | own | child 1 index + at_partitioned_0 | t | orig | + at_partitioned_0_id_name_key | t | orig | child 0 index + at_partitioned_1 | t | orig | + at_partitioned_1_id_name_key | t | orig | child 1 index at_partitioned_id_name_key | t | none | parent index (6 rows) @@ -2198,9 +2197,8 @@ select relname, c.oid = oldoid as orig_oid, case relfilenode when 0 then 'none' - when c.oid then 'own' when oldfilenode then 'orig' - else 'OTHER' + else 'new' end as storage, obj_description(c.oid, 'pg_class') as desc from pg_class c left join old_oids using (relname) @@ -2209,10 +2207,10 @@ select relname, relname | 
orig_oid | storage | desc ------------------------------+----------+---------+-------------- at_partitioned | t | none | - at_partitioned_0 | t | own | - at_partitioned_0_id_name_key | f | own | parent index - at_partitioned_1 | t | own | - at_partitioned_1_id_name_key | f | own | parent index + at_partitioned_0 | t | orig | + at_partitioned_0_id_name_key | f | new | parent index + at_partitioned_1 | t | orig | + at_partitioned_1_id_name_key | f | new | parent index at_partitioned_id_name_key | f | none | parent index (6 rows) @@ -2556,7 +2554,7 @@ CREATE FUNCTION check_ddl_rewrite(p_tablename regclass, p_ddl text) RETURNS boolean LANGUAGE plpgsql AS $$ DECLARE - v_relfilenode oid; + v_relfilenode int8; BEGIN v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 52001e3..4190b12 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -1478,9 +1478,8 @@ select relname, c.oid = oldoid as orig_oid, case relfilenode when 0 then 'none' - when c.oid then 'own' when oldfilenode then 'orig' - else 'OTHER' + else 'new' end as storage, obj_description(c.oid, 'pg_class') as desc from pg_class c left join old_oids using (relname) @@ -1499,9 +1498,8 @@ select relname, c.oid = oldoid as orig_oid, case relfilenode when 0 then 'none' - when c.oid then 'own' when oldfilenode then 'orig' - else 'OTHER' + else 'new' end as storage, obj_description(c.oid, 'pg_class') as desc from pg_class c left join old_oids using (relname) @@ -1638,7 +1636,7 @@ CREATE FUNCTION check_ddl_rewrite(p_tablename regclass, p_ddl text) RETURNS boolean LANGUAGE plpgsql AS $$ DECLARE - v_relfilenode oid; + v_relfilenode int8; BEGIN v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename; -- 1.8.3.1