From 044fa268cf3a8e85a8bc3c2f05f4d296a80a6ab1 Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Sun, 30 Jan 2022 20:06:59 +0530 Subject: [PATCH v2] Don't delay removing Tombstone file until next checkpoint Currently, we cannot remove the unused relfilenode until the next checkpoint because if we remove it immediately then there is a risk of reusing the same relfilenode for two different relations during a single checkpoint due to Oid wraparound. With this patch we are removing the need of keeping the Tombstone files until the next checkpoint by making the relfilenode unique within a cluster. For doing that we are making RelFileNode.relNode 64 bits wide so that relfilenode is never reused within a cluster. But that will make buffer tag wider by 32 bits so to avoid that we are removing the ForkNumber from the buffer tag and using 8 high bits of relNode for storing the fork number and remaining 56 bits for the relfilenode. --- .../pg_buffercache/pg_buffercache--1.0--1.1.sql | 2 +- contrib/pg_buffercache/pg_buffercache--1.2.sql | 2 +- contrib/pg_buffercache/pg_buffercache_pages.c | 10 +- contrib/pg_prewarm/autoprewarm.c | 4 +- doc/src/sgml/catalogs.sgml | 2 +- doc/src/sgml/pgbuffercache.sgml | 2 +- src/backend/access/common/syncscan.c | 3 +- src/backend/access/gin/ginxlog.c | 5 +- src/backend/access/rmgrdesc/gistdesc.c | 4 +- src/backend/access/rmgrdesc/heapdesc.c | 4 +- src/backend/access/rmgrdesc/nbtdesc.c | 4 +- src/backend/access/rmgrdesc/seqdesc.c | 4 +- src/backend/access/rmgrdesc/xlogdesc.c | 15 ++- src/backend/access/transam/varsup.c | 52 ++++++++++- src/backend/access/transam/xlog.c | 57 +++++++++--- src/backend/access/transam/xlogutils.c | 9 +- src/backend/catalog/catalog.c | 61 +++---------- src/backend/catalog/heap.c | 23 +++-- src/backend/catalog/index.c | 15 ++- src/backend/catalog/storage.c | 3 +- src/backend/commands/cluster.c | 4 +- src/backend/commands/indexcmds.c | 6 +- src/backend/commands/sequence.c | 2 +- src/backend/commands/tablecmds.c | 23 
+++-- src/backend/nodes/outfuncs.c | 2 +- src/backend/parser/parse_utilcmd.c | 4 +- src/backend/replication/logical/decode.c | 1 + src/backend/replication/logical/reorderbuffer.c | 2 +- src/backend/storage/buffer/bufmgr.c | 68 ++++++++------ src/backend/storage/buffer/localbuf.c | 8 +- src/backend/storage/freespace/fsmpage.c | 4 +- src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/storage/smgr/md.c | 68 ++++---------- src/backend/storage/sync/sync.c | 101 --------------------- src/backend/utils/adt/dbsize.c | 22 ++--- src/backend/utils/adt/pg_upgrade_support.c | 4 +- src/backend/utils/cache/relcache.c | 35 +++---- src/backend/utils/cache/relfilenodemap.c | 10 +- src/backend/utils/cache/relmapper.c | 39 ++++---- src/backend/utils/misc/pg_controldata.c | 9 +- src/bin/pg_checksums/pg_checksums.c | 6 +- src/bin/pg_controldata/pg_controldata.c | 2 + src/bin/pg_dump/pg_dump.c | 20 ++-- src/bin/pg_rewind/filemap.c | 16 ++-- src/bin/pg_upgrade/info.c | 4 +- src/bin/pg_upgrade/pg_upgrade.h | 4 +- src/bin/pg_upgrade/relfilenode.c | 4 +- src/bin/pg_waldump/pg_waldump.c | 14 +-- src/common/relpath.c | 22 ++--- src/fe_utils/option_utils.c | 42 +++++++++ src/include/access/transam.h | 4 + src/include/access/xlog.h | 1 + src/include/catalog/binary_upgrade.h | 2 +- src/include/catalog/catalog.h | 4 +- src/include/catalog/heap.h | 2 +- src/include/catalog/index.h | 2 +- src/include/catalog/pg_class.h | 10 +- src/include/catalog/pg_control.h | 2 + src/include/catalog/pg_proc.dat | 6 +- src/include/commands/tablecmds.h | 2 +- src/include/common/relpath.h | 6 +- src/include/fe_utils/option_utils.h | 3 + src/include/nodes/parsenodes.h | 2 +- src/include/postgres_ext.h | 15 +++ src/include/storage/buf_internals.h | 12 +-- src/include/storage/relfilenode.h | 70 ++++++++++++-- src/include/storage/sync.h | 1 - src/include/utils/rel.h | 2 +- src/include/utils/relcache.h | 2 +- src/include/utils/relfilenodemap.h | 2 +- src/include/utils/relmapper.h | 6 +- 
src/test/regress/expected/alter_table.out | 20 ++-- src/test/regress/sql/alter_table.sql | 4 +- 73 files changed, 545 insertions(+), 463 deletions(-) diff --git a/contrib/pg_buffercache/pg_buffercache--1.0--1.1.sql b/contrib/pg_buffercache/pg_buffercache--1.0--1.1.sql index 54d02f5..5e93238 100644 --- a/contrib/pg_buffercache/pg_buffercache--1.0--1.1.sql +++ b/contrib/pg_buffercache/pg_buffercache--1.0--1.1.sql @@ -6,6 +6,6 @@ -- Upgrade view to 1.1. format CREATE OR REPLACE VIEW pg_buffercache AS SELECT P.* FROM pg_buffercache_pages() AS P - (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + (bufferid integer, relfilenode int8, reltablespace oid, reldatabase oid, relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, pinning_backends int4); diff --git a/contrib/pg_buffercache/pg_buffercache--1.2.sql b/contrib/pg_buffercache/pg_buffercache--1.2.sql index 6ee5d84..f52ddcd 100644 --- a/contrib/pg_buffercache/pg_buffercache--1.2.sql +++ b/contrib/pg_buffercache/pg_buffercache--1.2.sql @@ -12,7 +12,7 @@ LANGUAGE C PARALLEL SAFE; -- Create a view for convenient access. 
CREATE VIEW pg_buffercache AS SELECT P.* FROM pg_buffercache_pages() AS P - (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + (bufferid integer, relfilenode int8, reltablespace oid, reldatabase oid, relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, pinning_backends int4); diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 1bd579f..ab1f959 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -26,7 +26,7 @@ PG_MODULE_MAGIC; typedef struct { uint32 bufferid; - Oid relfilenode; + RelNode relfilenode; Oid reltablespace; Oid reldatabase; ForkNumber forknum; @@ -103,7 +103,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", INT4OID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", - OIDOID, -1, 0); + INT8OID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", OIDOID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", @@ -153,10 +153,10 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) buf_state = LockBufHdr(bufHdr); fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr); - fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode; + fctx->record[i].relfilenode = RELFILENODE_GETRELNODE(bufHdr->tag.rnode); fctx->record[i].reltablespace = bufHdr->tag.rnode.spcNode; fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode; - fctx->record[i].forknum = bufHdr->tag.forkNum; + fctx->record[i].forknum = RELFILENODE_GETFORKNUM(bufHdr->tag.rnode); fctx->record[i].blocknum = bufHdr->tag.blockNum; fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state); fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state); @@ -209,7 +209,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) } else { - values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode); + values[1] = Int64GetDatum(fctx->record[i].relfilenode); 
nulls[1] = false; values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace); nulls[2] = false; diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index 5d40fb5..a03fd03 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -617,8 +617,8 @@ apw_dump_now(bool is_bgworker, bool dump_unlogged) { block_info_array[num_blocks].database = bufHdr->tag.rnode.dbNode; block_info_array[num_blocks].tablespace = bufHdr->tag.rnode.spcNode; - block_info_array[num_blocks].filenode = bufHdr->tag.rnode.relNode; - block_info_array[num_blocks].forknum = bufHdr->tag.forkNum; + block_info_array[num_blocks].filenode = RELFILENODE_GETRELNODE(bufHdr->tag.rnode); + block_info_array[num_blocks].forknum = RELFILENODE_GETFORKNUM(bufHdr->tag.rnode); block_info_array[num_blocks].blocknum = bufHdr->tag.blockNum; ++num_blocks; } diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 1e65c42..6eddce1 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -1960,7 +1960,7 @@ SCRAM-SHA-256$<iteration count>:&l - relfilenode oid + relfilenode int8 Name of the on-disk file of this relation; zero means this diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index e68d159..631cd2f 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -62,7 +62,7 @@ - relfilenode oid + relfilenode int8 (references pg_class.relfilenode) diff --git a/src/backend/access/common/syncscan.c b/src/backend/access/common/syncscan.c index d5b16c5..386de77 100644 --- a/src/backend/access/common/syncscan.c +++ b/src/backend/access/common/syncscan.c @@ -161,7 +161,8 @@ SyncScanShmemInit(void) */ item->location.relfilenode.spcNode = InvalidOid; item->location.relfilenode.dbNode = InvalidOid; - item->location.relfilenode.relNode = InvalidOid; + RELFILENODE_SETRELNODE(item->location.relfilenode, + InvalidRelfileNode); item->location.location = InvalidBlockNumber; item->prev = 
(i > 0) ? diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 87e8366..131e6f1 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -100,8 +100,9 @@ ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rda BlockNumber blknum; BufferGetTag(buffer, &node, &forknum, &blknum); - elog(ERROR, "failed to add item to index page in %u/%u/%u", - node.spcNode, node.dbNode, node.relNode); + elog(ERROR, "failed to add item to index page in %u/%u/" INT64_FORMAT, + node.spcNode, node.dbNode, + RELFILENODE_GETRELNODE(node)); } } diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c index 9cab4fa..7ba70c0 100644 --- a/src/backend/access/rmgrdesc/gistdesc.c +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -26,9 +26,9 @@ out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec) static void out_gistxlogPageReuse(StringInfo buf, gistxlogPageReuse *xlrec) { - appendStringInfo(buf, "rel %u/%u/%u; blk %u; latestRemovedXid %u:%u", + appendStringInfo(buf, "rel %u/%u/" INT64_FORMAT "; blk %u; latestRemovedXid %u:%u", xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block, + RELFILENODE_GETRELNODE(xlrec->node), xlrec->block, EpochFromFullTransactionId(xlrec->latestRemovedFullXid), XidFromFullTransactionId(xlrec->latestRemovedFullXid)); } diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index 6238085..3c28c09 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -169,10 +169,10 @@ heap2_desc(StringInfo buf, XLogReaderState *record) { xl_heap_new_cid *xlrec = (xl_heap_new_cid *) rec; - appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u", + appendStringInfo(buf, "rel %u/%u/" INT64_FORMAT "; tid %u/%u", xlrec->target_node.spcNode, xlrec->target_node.dbNode, - xlrec->target_node.relNode, + RELFILENODE_GETRELNODE(xlrec->target_node), 
ItemPointerGetBlockNumber(&(xlrec->target_tid)), ItemPointerGetOffsetNumber(&(xlrec->target_tid))); appendStringInfo(buf, "; cmin: %u, cmax: %u, combo: %u", diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index dfbbf4e..cfcc3a1 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -100,9 +100,9 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec; - appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u", + appendStringInfo(buf, "rel %u/%u/" INT64_FORMAT "; latestRemovedXid %u:%u", xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, + RELFILENODE_GETRELNODE(xlrec->node), EpochFromFullTransactionId(xlrec->latestRemovedFullXid), XidFromFullTransactionId(xlrec->latestRemovedFullXid)); break; diff --git a/src/backend/access/rmgrdesc/seqdesc.c b/src/backend/access/rmgrdesc/seqdesc.c index d9b1e60..ffa6a86 100644 --- a/src/backend/access/rmgrdesc/seqdesc.c +++ b/src/backend/access/rmgrdesc/seqdesc.c @@ -25,9 +25,9 @@ seq_desc(StringInfo buf, XLogReaderState *record) xl_seq_rec *xlrec = (xl_seq_rec *) rec; if (info == XLOG_SEQ_LOG) - appendStringInfo(buf, "rel %u/%u/%u", + appendStringInfo(buf, "rel %u/%u/" INT64_FORMAT, xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode); + RELFILENODE_GETRELNODE(xlrec->node)); } const char * diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index e7452af..9066566 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -45,8 +45,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) CheckPoint *checkpoint = (CheckPoint *) rec; appendStringInfo(buf, "redo %X/%X; " - "tli %u; prev tli %u; fpw %s; xid %u:%u; oid %u; multi %u; offset %u; " - "oldest xid %u in DB %u; oldest multi %u in DB %u; " + "tli %u; prev tli %u; fpw %s; xid %u:%u; relfilenode " INT64_FORMAT ";oid %u; " + 
"multi %u; offset %u; oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest/newest commit timestamp xid: %u/%u; " "oldest running xid %u; %s", LSN_FORMAT_ARGS(checkpoint->redo), @@ -55,6 +55,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) checkpoint->fullPageWrites ? "true" : "false", EpochFromFullTransactionId(checkpoint->nextXid), XidFromFullTransactionId(checkpoint->nextXid), + checkpoint->nextRelNode, checkpoint->nextOid, checkpoint->nextMulti, checkpoint->nextMultiOffset, @@ -74,6 +75,13 @@ xlog_desc(StringInfo buf, XLogReaderState *record) memcpy(&nextOid, rec, sizeof(Oid)); appendStringInfo(buf, "%u", nextOid); } + else if (info == XLOG_NEXT_RELFILENODE) + { + RelNode nextRelFilenode; + + memcpy(&nextRelFilenode, rec, sizeof(RelNode)); + appendStringInfo(buf, INT64_FORMAT, nextRelFilenode); + } else if (info == XLOG_RESTORE_POINT) { xl_restore_point *xlrec = (xl_restore_point *) rec; @@ -169,6 +177,9 @@ xlog_identify(uint8 info) case XLOG_NEXTOID: id = "NEXTOID"; break; + case XLOG_NEXT_RELFILENODE: + id = "NEXT_RELFILENODE"; + break; case XLOG_SWITCH: id = "SWITCH"; break; diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 748120a..7edb9fa 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -30,6 +30,9 @@ /* Number of OIDs to prefetch (preallocate) per XLOG write */ #define VAR_OID_PREFETCH 8192 +/* Number of RelFileNode to prefetch (preallocate) per XLOG write */ +#define VAR_RFN_PREFETCH 8192 + /* pointer to "variable cache" in shared memory (set up by shmem.c) */ VariableCache ShmemVariableCache = NULL; @@ -521,8 +524,7 @@ ForceTransactionIdLimitUpdate(void) * wide, counter wraparound will occur eventually, and therefore it is unwise * to assume they are unique unless precautions are taken to make them so. * Hence, this routine should generally not be used directly. 
The only direct - * callers should be GetNewOidWithIndex() and GetNewRelFileNode() in - * catalog/catalog.c. + * callers should be GetNewOidWithIndex() in catalog/catalog.c. */ Oid GetNewObjectId(void) @@ -613,6 +615,52 @@ SetNextObjectId(Oid nextOid) } /* + * GetNewRelNode + * + * Similar to GetNewObjectId, but generates a new relnode instead of a new Oid. + */ +RelNode +GetNewRelNode(void) +{ + RelNode result; + + /* safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot assign RelFileNode during recovery"); + + LWLockAcquire(RelNodeGenLock, LW_EXCLUSIVE); + + /* + * Check for the wraparound for the relnode counter. + * + * XXX Actually the relnode is 56 bits wide so we don't need to worry about + * the wraparound case. + */ + if (ShmemVariableCache->nextRelNode > MAX_RELFILENODE) + { + ShmemVariableCache->nextRelNode = FirstNormalRelfileNode; + ShmemVariableCache->relnodecount = 0; + } + + /* If we run out of logged for use RelNode then we must log more */ + if (ShmemVariableCache->relnodecount == 0) + { + XLogPutNextRelFileNode(ShmemVariableCache->nextRelNode + + VAR_RFN_PREFETCH); + + ShmemVariableCache->relnodecount = VAR_RFN_PREFETCH; + } + + result = ShmemVariableCache->nextRelNode; + (ShmemVariableCache->nextRelNode)++; + (ShmemVariableCache->relnodecount)--; + + LWLockRelease(RelNodeGenLock); + + return result; +} + +/* * StopGeneratingPinnedObjectIds * * This is called once during initdb to force the OID counter up to diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index dfe2a0b..290e4fc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1541,8 +1541,9 @@ checkXLogConsistency(XLogReaderState *record) if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0) { elog(FATAL, - "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", - rnode.spcNode, rnode.dbNode, rnode.relNode, + "inconsistent page found, rel 
%u/%u/" INT64_FORMAT ", forknum %u, blkno %u", + rnode.spcNode, rnode.dbNode, + RELFILENODE_GETRELNODE(rnode), forknum, blkno); } } @@ -5396,6 +5397,7 @@ BootStrapXLOG(void) checkPoint.nextXid = FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); checkPoint.nextOid = FirstGenbkiObjectId; + checkPoint.nextRelNode = FirstNormalRelfileNode; checkPoint.nextMulti = FirstMultiXactId; checkPoint.nextMultiOffset = 0; checkPoint.oldestXid = FirstNormalTransactionId; @@ -5409,7 +5411,9 @@ BootStrapXLOG(void) ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->nextRelNode = checkPoint.nextRelNode; ShmemVariableCache->oidCount = 0; + ShmemVariableCache->relnodecount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -7147,7 +7151,9 @@ StartupXLOG(void) /* initialize shared memory variables from the checkpoint record */ ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->nextRelNode = checkPoint.nextRelNode; ShmemVariableCache->oidCount = 0; + ShmemVariableCache->relnodecount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -9259,6 +9265,12 @@ CreateCheckPoint(int flags) checkPoint.nextOid += ShmemVariableCache->oidCount; LWLockRelease(OidGenLock); + LWLockAcquire(RelNodeGenLock, LW_SHARED); + checkPoint.nextRelNode = ShmemVariableCache->nextRelNode; + if (!shutdown) + checkPoint.nextRelNode += ShmemVariableCache->relnodecount; + LWLockRelease(RelNodeGenLock); + MultiXactGetCheckptMulti(shutdown, &checkPoint.nextMulti, &checkPoint.nextMultiOffset, @@ -9405,11 +9417,6 @@ CreateCheckPoint(int flags) END_CRIT_SECTION(); /* - * Let smgr do 
post-checkpoint cleanup (eg, deleting old files). - */ - SyncPostCheckpoint(); - - /* * Update the average distance between checkpoints if the prior checkpoint * exists. */ @@ -10070,6 +10077,18 @@ XLogPutNextOid(Oid nextOid) } /* + * Similar to XLogPutNextOid, but instead of writing a NEXTOID log record it + * writes a NEXT_RELFILENODE log record. + */ +void +XLogPutNextRelFileNode(RelNode nextrelnode) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&nextrelnode), sizeof(RelNode)); + (void) XLogInsert(RM_XLOG_ID, XLOG_NEXT_RELFILENODE); +} + +/* * Write an XLOG SWITCH record. * * Here we just blindly issue an XLogInsert request for the record. @@ -10331,6 +10350,16 @@ xlog_redo(XLogReaderState *record) ShmemVariableCache->oidCount = 0; LWLockRelease(OidGenLock); } + else if (info == XLOG_NEXT_RELFILENODE) + { + RelNode nextRelNode; + + memcpy(&nextRelNode, XLogRecGetData(record), sizeof(RelNode)); + LWLockAcquire(RelNodeGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextRelNode = nextRelNode; + ShmemVariableCache->relnodecount = 0; + LWLockRelease(RelNodeGenLock); + } else if (info == XLOG_CHECKPOINT_SHUTDOWN) { CheckPoint checkPoint; @@ -10344,6 +10373,10 @@ xlog_redo(XLogReaderState *record) ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; LWLockRelease(OidGenLock); + LWLockAcquire(RelNodeGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextRelNode = checkPoint.nextRelNode; + ShmemVariableCache->relnodecount = 0; + LWLockRelease(RelNodeGenLock); MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); @@ -10713,15 +10746,17 @@ xlog_block_info(StringInfo buf, XLogReaderState *record) XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk); if (forknum != MAIN_FORKNUM) - appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u", + appendStringInfo(buf, "; blkref #%d: rel %u/%u/" INT64_FORMAT ", fork %u, blk %u", block_id, - rnode.spcNode, rnode.dbNode, rnode.relNode, + rnode.spcNode, rnode.dbNode, + 
RELFILENODE_GETRELNODE(rnode), forknum, blk); else - appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u", + appendStringInfo(buf, "; blkref #%d: rel %u/%u/" INT64_FORMAT ", blk %u", block_id, - rnode.spcNode, rnode.dbNode, rnode.relNode, + rnode.spcNode, rnode.dbNode, + RELFILENODE_GETRELNODE(rnode), blk); if (XLogRecHasBlockImage(record, block_id)) appendStringInfoString(buf, " FPW"); diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 90e1c483..0c4d8e2 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -593,17 +593,18 @@ CreateFakeRelcacheEntry(RelFileNode rnode) rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT; /* We don't know the name of the relation; use relfilenode instead */ - sprintf(RelationGetRelationName(rel), "%u", rnode.relNode); + sprintf(RelationGetRelationName(rel), INT64_FORMAT, + RELFILENODE_GETRELNODE(rnode)); /* * We set up the lockRelId in case anything tries to lock the dummy - * relation. Note that this is fairly bogus since relNode may be - * different from the relation's OID. It shouldn't really matter though. + * relation. Note we are setting relId to just FirstNormalObjectId which + * is completely bogus. It shouldn't really matter though. * In recovery, we are running by ourselves and can't have any lock * conflicts. While syncing, we already hold AccessExclusiveLock. 
*/ rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode; - rel->rd_lockInfo.lockRelId.relId = rnode.relNode; + rel->rd_lockInfo.lockRelId.relId = FirstNormalObjectId; rel->rd_smgr = NULL; diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index dfd5fb6..5afbd07 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -472,27 +472,18 @@ GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn) /* * GetNewRelFileNode - * Generate a new relfilenode number that is unique within the - * database of the given tablespace. + * Generate a new relfilenode number. * - * If the relfilenode will also be used as the relation's OID, pass the - * opened pg_class catalog, and this routine will guarantee that the result - * is also an unused OID within pg_class. If the result is to be used only - * as a relfilenode for an existing relation, pass NULL for pg_class. - * - * As with GetNewOidWithIndex(), there is some theoretical risk of a race - * condition, but it doesn't seem worth worrying about. - * - * Note: we don't support using this in bootstrap mode. All relations - * created by bootstrap have preassigned OIDs, so there's no need. + * We are using 56 bits for the relfilenode, so we expect it to be unique within + * the cluster; if a file with that name already exists, report an error. */ -Oid -GetNewRelFileNode(Oid reltablespace, Relation pg_class, char relpersistence) +RelNode +GetNewRelFileNode(Oid reltablespace, char relpersistence) { RelFileNodeBackend rnode; char *rpath; - bool collides; BackendId backend; + RelNode relNode; /* * If we ever get here during pg_upgrade, there's something wrong; all @@ -525,42 +516,16 @@ GetNewRelFileNode(Oid reltablespace, Relation pg_class, char relpersistence) * are properly detected. 
*/ rnode.backend = backend; + relNode = GetNewRelNode(); + RELFILENODE_SETRELNODE(rnode.node, relNode); - do - { - CHECK_FOR_INTERRUPTS(); - - /* Generate the OID */ - if (pg_class) - rnode.node.relNode = GetNewOidWithIndex(pg_class, ClassOidIndexId, - Anum_pg_class_oid); - else - rnode.node.relNode = GetNewObjectId(); - - /* Check for existing file of same name */ - rpath = relpath(rnode, MAIN_FORKNUM); + /* Check for existing file of same name */ + rpath = relpath(rnode, MAIN_FORKNUM); - if (access(rpath, F_OK) == 0) - { - /* definite collision */ - collides = true; - } - else - { - /* - * Here we have a little bit of a dilemma: if errno is something - * other than ENOENT, should we declare a collision and loop? In - * practice it seems best to go ahead regardless of the errno. If - * there is a colliding file we will get an smgr failure when we - * attempt to create the new relation file. - */ - collides = false; - } - - pfree(rpath); - } while (collides); + if (access(rpath, F_OK) == 0) + elog(ERROR, "new relfilenode file already exists: \"%s\"", rpath); - return rnode.node.relNode; + return relNode; } /* diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 7e99de8..4976df0 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -93,7 +93,7 @@ Oid binary_upgrade_next_heap_pg_class_oid = InvalidOid; Oid binary_upgrade_next_heap_pg_class_relfilenode = InvalidOid; Oid binary_upgrade_next_toast_pg_class_oid = InvalidOid; -Oid binary_upgrade_next_toast_pg_class_relfilenode = InvalidOid; +RelNode binary_upgrade_next_toast_pg_class_relfilenode = InvalidRelfileNode; static void AddNewRelationTuple(Relation pg_class_desc, Relation new_rel_desc, @@ -303,7 +303,7 @@ heap_create(const char *relname, Oid relnamespace, Oid reltablespace, Oid relid, - Oid relfilenode, + RelNode relfilenode, Oid accessmtd, TupleDesc tupDesc, char relkind, @@ -358,8 +358,8 @@ heap_create(const char *relname, * If relfilenode is unspecified by the 
caller then create storage * with oid same as relid. */ - if (!OidIsValid(relfilenode)) - relfilenode = relid; + if (!RelfileNodeIsValid(relfilenode)) + relfilenode = GetNewRelFileNode(reltablespace, relpersistence); } /* @@ -912,7 +912,7 @@ InsertPgClassTuple(Relation pg_class_desc, values[Anum_pg_class_reloftype - 1] = ObjectIdGetDatum(rd_rel->reloftype); values[Anum_pg_class_relowner - 1] = ObjectIdGetDatum(rd_rel->relowner); values[Anum_pg_class_relam - 1] = ObjectIdGetDatum(rd_rel->relam); - values[Anum_pg_class_relfilenode - 1] = ObjectIdGetDatum(rd_rel->relfilenode); + values[Anum_pg_class_relfilenode - 1] = Int64GetDatum(rd_rel->relfilenode); values[Anum_pg_class_reltablespace - 1] = ObjectIdGetDatum(rd_rel->reltablespace); values[Anum_pg_class_relpages - 1] = Int32GetDatum(rd_rel->relpages); values[Anum_pg_class_reltuples - 1] = Float4GetDatum(rd_rel->reltuples); @@ -1129,7 +1129,7 @@ heap_create_with_catalog(const char *relname, Oid new_type_oid; /* By default set to InvalidOid unless overridden by binary-upgrade */ - Oid relfilenode = InvalidOid; + RelNode relfilenode = InvalidRelfileNode; TransactionId relfrozenxid; MultiXactId relminmxid; @@ -1187,8 +1187,7 @@ heap_create_with_catalog(const char *relname, /* * Allocate an OID for the relation, unless we were told what to use. * - * The OID will be the relfilenode as well, so make sure it doesn't - * collide with either pg_class OIDs or existing physical files. + * Make sure that the Oid doesn't collide with either pg_class OIDs. 
*/ if (!OidIsValid(relid)) { @@ -1210,13 +1209,13 @@ heap_create_with_catalog(const char *relname, relid = binary_upgrade_next_toast_pg_class_oid; binary_upgrade_next_toast_pg_class_oid = InvalidOid; - if (!OidIsValid(binary_upgrade_next_toast_pg_class_relfilenode)) + if (!RelfileNodeIsValid(binary_upgrade_next_toast_pg_class_relfilenode)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("toast relfilenode value not set when in binary upgrade mode"))); relfilenode = binary_upgrade_next_toast_pg_class_relfilenode; - binary_upgrade_next_toast_pg_class_relfilenode = InvalidOid; + binary_upgrade_next_toast_pg_class_relfilenode = InvalidRelfileNode; } } else @@ -1243,8 +1242,8 @@ heap_create_with_catalog(const char *relname, } if (!OidIsValid(relid)) - relid = GetNewRelFileNode(reltablespace, pg_class_desc, - relpersistence); + relid = GetNewOidWithIndex(pg_class_desc, ClassOidIndexId, + Anum_pg_class_oid); } /* diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 2308d40..6e43237 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -661,7 +661,7 @@ UpdateIndexRelation(Oid indexoid, * parent index; otherwise InvalidOid. * parentConstraintId: if creating a constraint on a partition, the OID * of the constraint in the parent; otherwise InvalidOid. - * relFileNode: normally, pass InvalidOid to get new storage. May be + * relFileNode: normally, pass InvalidRelfileNode to get new storage. May be * nonzero to attach an existing valid build. 
* indexInfo: same info executor uses to insert into the index * indexColNames: column names to use for index (List of char *) @@ -702,7 +702,7 @@ index_create(Relation heapRelation, Oid indexRelationId, Oid parentIndexRelid, Oid parentConstraintId, - Oid relFileNode, + RelNode relFileNode, IndexInfo *indexInfo, List *indexColNames, Oid accessMethodObjectId, @@ -734,7 +734,7 @@ index_create(Relation heapRelation, char relkind; TransactionId relfrozenxid; MultiXactId relminmxid; - bool create_storage = !OidIsValid(relFileNode); + bool create_storage = !RelfileNodeIsValid(relFileNode); /* constraint flags can only be set when a constraint is requested */ Assert((constr_flags == 0) || @@ -901,8 +901,7 @@ index_create(Relation heapRelation, /* * Allocate an OID for the index, unless we were told what to use. * - * The OID will be the relfilenode as well, so make sure it doesn't - * collide with either pg_class OIDs or existing physical files. + * Make sure it doesn't collide with either pg_class OIDs. 
*/ if (!OidIsValid(indexRelationId)) { @@ -935,8 +934,8 @@ index_create(Relation heapRelation, } else { - indexRelationId = - GetNewRelFileNode(tableSpaceId, pg_class, relpersistence); + indexRelationId = GetNewOidWithIndex(pg_class, ClassOidIndexId, + Anum_pg_class_oid); } } @@ -1406,7 +1405,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId, InvalidOid, /* indexRelationId */ InvalidOid, /* parentIndexRelid */ InvalidOid, /* parentConstraintId */ - InvalidOid, /* relFileNode */ + InvalidRelfileNode, /* relFileNode */ newInfo, indexColNames, indexRelation->rd_rel->relam, diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 9b80755..712e995 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -593,7 +593,8 @@ RestorePendingSyncs(char *startAddress) RelFileNode *rnode; Assert(pendingSyncHash == NULL); - for (rnode = (RelFileNode *) startAddress; rnode->relNode != 0; rnode++) + for (rnode = (RelFileNode *) startAddress; + RELFILENODE_GETRELNODE(*rnode) != 0; rnode++) AddPendingSync(rnode); } diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 2e8efe4..2423003 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -1006,9 +1006,9 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, reltup2; Form_pg_class relform1, relform2; - Oid relfilenode1, + RelNode relfilenode1, relfilenode2; - Oid swaptemp; + RelNode swaptemp; char swptmpchr; /* We need writable copies of both pg_class tuples. */ diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 42aacc8..a1c9c24 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1085,7 +1085,7 @@ DefineIndex(Oid relationId, * A valid stmt->oldNode implies that we already have a built form of the * index. The caller should also decline any index build. 
*/ - Assert(!OidIsValid(stmt->oldNode) || (skip_build && !concurrent)); + Assert(!RelfileNodeIsValid(stmt->oldNode) || (skip_build && !concurrent)); /* * Make the catalog entries for the index, including constraints. This @@ -1315,7 +1315,7 @@ DefineIndex(Oid relationId, childStmt->idxname = NULL; childStmt->relation = NULL; childStmt->indexOid = InvalidOid; - childStmt->oldNode = InvalidOid; + childStmt->oldNode = InvalidRelfileNode; childStmt->oldCreateSubid = InvalidSubTransactionId; childStmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId; @@ -2896,7 +2896,7 @@ ReindexMultipleTables(const char *objectName, ReindexObjectType objectKind, * particular this eliminates all shared catalogs.). */ if (RELKIND_HAS_STORAGE(classtuple->relkind) && - !OidIsValid(classtuple->relfilenode)) + !RelfileNodeIsValid(classtuple->relfilenode)) skip_rel = true; /* diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 27cb630..72137f6 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -74,7 +74,7 @@ typedef struct sequence_magic typedef struct SeqTableData { Oid relid; /* pg_class OID of this sequence (hash key) */ - Oid filenode; /* last seen relfilenode of this sequence */ + RelNode filenode; /* last seen relfilenode of this sequence */ LocalTransactionId lxid; /* xact in which we last did a seq op */ bool last_valid; /* do we have a valid "last" value? */ int64 last; /* value last returned by nextval */ diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 1f0654c..36987f3 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -3304,7 +3304,7 @@ CheckRelationTableSpaceMove(Relation rel, Oid newTableSpaceId) void SetRelationTableSpace(Relation rel, Oid newTableSpaceId, - Oid newRelFileNode) + RelNode newRelFileNode) { Relation pg_class; HeapTuple tuple; @@ -3324,7 +3324,7 @@ SetRelationTableSpace(Relation rel, /* Update the pg_class row. 
*/ rd_rel->reltablespace = (newTableSpaceId == MyDatabaseTableSpace) ? InvalidOid : newTableSpaceId; - if (OidIsValid(newRelFileNode)) + if (newRelFileNode != InvalidRelfileNode) rd_rel->relfilenode = newRelFileNode; CatalogTupleUpdate(pg_class, &tuple->t_self, tuple); @@ -8572,7 +8572,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, /* suppress schema rights check when rebuilding existing index */ check_rights = !is_rebuild; /* skip index build if phase 3 will do it or we're reusing an old one */ - skip_build = tab->rewrite > 0 || OidIsValid(stmt->oldNode); + skip_build = tab->rewrite > 0 || RelfileNodeIsValid(stmt->oldNode); /* suppress notices when rebuilding existing index */ quiet = is_rebuild; @@ -8596,7 +8596,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, * DROP of the old edition of this index will have scheduled the storage * for deletion at commit, so cancel that pending deletion. */ - if (OidIsValid(stmt->oldNode)) + if (RelfileNodeIsValid(stmt->oldNode)) { Relation irel = index_open(address.objectId, NoLock); @@ -13441,7 +13441,7 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt) /* If it's a partitioned index, there is no storage to share. */ if (irel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) { - stmt->oldNode = irel->rd_node.relNode; + stmt->oldNode = RELFILENODE_GETRELNODE(irel->rd_node); stmt->oldCreateSubid = irel->rd_createSubid; stmt->oldFirstRelfilenodeSubid = irel->rd_firstRelfilenodeSubid; } @@ -14290,7 +14290,7 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) { Relation rel; Oid reltoastrelid; - Oid newrelfilenode; + RelNode newrelfilenode; RelFileNode newrnode; List *reltoastidxids = NIL; ListCell *lc; @@ -14320,15 +14320,18 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) } /* - * Relfilenodes are not unique in databases across tablespaces, so we need - * to allocate a new one in the new tablespace. + * Generate a new relfilenode. 
Although relfilenodes are unique within a + * cluster, we are unable to use the old relfilenode since unused + * relfilenodes are not unlinked until commit. So if, within a transaction, + * we set the old tablespace again, we will get a conflicting relfilenode + * file. */ - newrelfilenode = GetNewRelFileNode(newTableSpace, NULL, + newrelfilenode = GetNewRelFileNode(newTableSpace, rel->rd_rel->relpersistence); /* Open old and new relation */ newrnode = rel->rd_node; - newrnode.relNode = newrelfilenode; + RELFILENODE_SETRELNODE(newrnode, newrelfilenode); newrnode.spcNode = newTableSpace; /* hand off to AM to actually create the new filenode and copy the data */ diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 2b02369..9b64842 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2771,7 +2771,7 @@ _outIndexStmt(StringInfo str, const IndexStmt *node) WRITE_NODE_FIELD(excludeOpNames); WRITE_STRING_FIELD(idxcomment); WRITE_OID_FIELD(indexOid); - WRITE_OID_FIELD(oldNode); + WRITE_UINT64_FIELD(oldNode); WRITE_UINT_FIELD(oldCreateSubid); WRITE_UINT_FIELD(oldFirstRelfilenodeSubid); WRITE_BOOL_FIELD(unique); diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 0eea214..209eabf 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1577,7 +1577,7 @@ generateClonedIndexStmt(RangeVar *heapRel, Relation source_idx, index->excludeOpNames = NIL; index->idxcomment = NULL; index->indexOid = InvalidOid; - index->oldNode = InvalidOid; + index->oldNode = InvalidRelfileNode; index->oldCreateSubid = InvalidSubTransactionId; index->oldFirstRelfilenodeSubid = InvalidSubTransactionId; index->unique = idxrec->indisunique; @@ -2197,7 +2197,7 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) index->excludeOpNames = NIL; index->idxcomment = NULL; index->indexOid = InvalidOid; - index->oldNode = InvalidOid; + index->oldNode =
InvalidRelfileNode; index->oldCreateSubid = InvalidSubTransactionId; index->oldFirstRelfilenodeSubid = InvalidSubTransactionId; index->transformed = false; diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 3fb5a92..23822c1 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -154,6 +154,7 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) break; case XLOG_NOOP: case XLOG_NEXTOID: + case XLOG_NEXT_RELFILENODE: case XLOG_SWITCH: case XLOG_BACKUP_END: case XLOG_PARAMETER_CHANGE: diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 19b2ba2..143d403 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -2134,7 +2134,7 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, Assert(snapshot_now); reloid = RelidByRelfilenode(change->data.tp.relnode.spcNode, - change->data.tp.relnode.relNode); + RELFILENODE_GETRELNODE(change->data.tp.relnode)); /* * Mapped catalog tuple without data, emitted while diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index a2512e7..ada326b 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -818,7 +818,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum, smgr->smgr_rnode.node.spcNode, smgr->smgr_rnode.node.dbNode, - smgr->smgr_rnode.node.relNode, + RELFILENODE_GETRELNODE(smgr->smgr_rnode.node), smgr->smgr_rnode.backend, isExtend); @@ -880,7 +880,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum, smgr->smgr_rnode.node.spcNode, smgr->smgr_rnode.node.dbNode, - smgr->smgr_rnode.node.relNode, + RELFILENODE_GETRELNODE(smgr->smgr_rnode.node), 
smgr->smgr_rnode.backend, isExtend, found); @@ -1070,7 +1070,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum, smgr->smgr_rnode.node.spcNode, smgr->smgr_rnode.node.dbNode, - smgr->smgr_rnode.node.relNode, + RELFILENODE_GETRELNODE(smgr->smgr_rnode.node), smgr->smgr_rnode.backend, isExtend, found); @@ -1249,7 +1249,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum, smgr->smgr_rnode.node.spcNode, smgr->smgr_rnode.node.dbNode, - smgr->smgr_rnode.node.relNode); + RELFILENODE_GETRELNODE(smgr->smgr_rnode.node)); FlushBuffer(buf, NULL); LWLockRelease(BufferDescriptorGetContentLock(buf)); @@ -1260,7 +1260,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum, smgr->smgr_rnode.node.spcNode, smgr->smgr_rnode.node.dbNode, - smgr->smgr_rnode.node.relNode); + RELFILENODE_GETRELNODE(smgr->smgr_rnode.node)); } else { @@ -1640,7 +1640,7 @@ ReleaseAndReadBuffer(Buffer buffer, bufHdr = GetLocalBufferDescriptor(-buffer - 1); if (bufHdr->tag.blockNum == blockNum && RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) && - bufHdr->tag.forkNum == forkNum) + RELFILENODE_GETFORKNUM(bufHdr->tag.rnode) == forkNum) return buffer; ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer); LocalRefCount[-buffer - 1]--; @@ -1651,7 +1651,7 @@ ReleaseAndReadBuffer(Buffer buffer, /* we have pin, so it's ok to examine tag without spinlock */ if (bufHdr->tag.blockNum == blockNum && RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) && - bufHdr->tag.forkNum == forkNum) + RELFILENODE_GETFORKNUM(bufHdr->tag.rnode) == forkNum) return buffer; UnpinBuffer(bufHdr, true); } @@ -1993,8 +1993,8 @@ BufferSync(int flags) item = &CkptBufferIds[num_to_scan++]; item->buf_id = buf_id; item->tsId = bufHdr->tag.rnode.spcNode; - item->relNode = 
bufHdr->tag.rnode.relNode; - item->forkNum = bufHdr->tag.forkNum; + item->relNode = RELFILENODE_GETRELNODE(bufHdr->tag.rnode); + item->forkNum = RELFILENODE_GETFORKNUM(bufHdr->tag.rnode); item->blockNum = bufHdr->tag.blockNum; } @@ -2701,7 +2701,8 @@ PrintBufferLeakWarning(Buffer buffer) } /* theoretically we should lock the bufhdr here */ - path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum); + path = relpathbackend(buf->tag.rnode, backend, + RELFILENODE_GETFORKNUM(buf->tag.rnode)); buf_state = pg_atomic_read_u32(&buf->state); elog(WARNING, "buffer refcount leak: [%03d] " @@ -2781,7 +2782,14 @@ BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum, /* pinned, so OK to read tag without spinlock */ *rnode = bufHdr->tag.rnode; - *forknum = bufHdr->tag.forkNum; + + /* + * Clear the fork number from the output rnode->relNode. For more details + * refer comments atop RelFileNode. + */ + RELFILENODE_SETRELNODE(*rnode, RELFILENODE_GETRELNODE(*rnode)); + + *forknum = RELFILENODE_GETFORKNUM(bufHdr->tag.rnode); *blknum = bufHdr->tag.blockNum; } @@ -2833,11 +2841,11 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) if (reln == NULL) reln = smgropen(buf->tag.rnode, InvalidBackendId); - TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum, + TRACE_POSTGRESQL_BUFFER_FLUSH_START(RELFILENODE_GETFORKNUM(buf->tag.rnode), buf->tag.blockNum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode); + RELFILENODE_GETRELNODE(reln->smgr_rnode.node)); buf_state = LockBufHdr(buf); @@ -2892,7 +2900,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) * bufToWrite is either the shared buffer or a copy, as appropriate. 
*/ smgrwrite(reln, - buf->tag.forkNum, + RELFILENODE_GETFORKNUM(buf->tag.rnode), buf->tag.blockNum, bufToWrite, false); @@ -2913,11 +2921,11 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) */ TerminateBufferIO(buf, true, 0); - TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum, + TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(RELFILENODE_GETFORKNUM(buf->tag.rnode), buf->tag.blockNum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode); + RELFILENODE_GETRELNODE(reln->smgr_rnode.node)); /* Pop the error context stack */ error_context_stack = errcallback.previous; @@ -3142,7 +3150,7 @@ DropRelFileNodeBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, for (j = 0; j < nforks; j++) { if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) && - bufHdr->tag.forkNum == forkNum[j] && + RELFILENODE_GETFORKNUM(bufHdr->tag.rnode) == forkNum[j] && bufHdr->tag.blockNum >= firstDelBlock[j]) { InvalidateBuffer(bufHdr); /* releases spinlock */ @@ -3374,7 +3382,7 @@ FindAndDropRelFileNodeBuffers(RelFileNode rnode, ForkNumber forkNum, buf_state = LockBufHdr(bufHdr); if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) && - bufHdr->tag.forkNum == forkNum && + RELFILENODE_GETFORKNUM(bufHdr->tag.rnode) == forkNum && bufHdr->tag.blockNum >= firstDelBlock) InvalidateBuffer(bufHdr); /* releases spinlock */ else @@ -3528,7 +3536,7 @@ FlushRelationBuffers(Relation rel) PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); smgrwrite(RelationGetSmgr(rel), - bufHdr->tag.forkNum, + RELFILENODE_GETFORKNUM(bufHdr->tag.rnode), bufHdr->tag.blockNum, localpage, false); @@ -4491,7 +4499,8 @@ AbortBufferIO(void) /* Buffer is pinned, so we can read tag without spinlock */ char *path; - path = relpathperm(buf->tag.rnode, buf->tag.forkNum); + path = relpathperm(buf->tag.rnode, + RELFILENODE_GETFORKNUM(buf->tag.rnode)); ereport(WARNING, (errcode(ERRCODE_IO_ERROR), errmsg("could not write block %u of %s", @@ -4515,7 +4524,8 @@ shared_buffer_write_error_callback(void *arg) /* 
Buffer is pinned, so we can read the tag without locking the spinlock */ if (bufHdr != NULL) { - char *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum); + char *path = relpathperm(bufHdr->tag.rnode, + RELFILENODE_GETFORKNUM(bufHdr->tag.rnode)); errcontext("writing block %u of relation %s", bufHdr->tag.blockNum, path); @@ -4534,7 +4544,7 @@ local_buffer_write_error_callback(void *arg) if (bufHdr != NULL) { char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId, - bufHdr->tag.forkNum); + RELFILENODE_GETFORKNUM(bufHdr->tag.rnode)); errcontext("writing block %u of relation %s", bufHdr->tag.blockNum, path); @@ -4551,9 +4561,9 @@ rnode_comparator(const void *p1, const void *p2) RelFileNode n1 = *(const RelFileNode *) p1; RelFileNode n2 = *(const RelFileNode *) p2; - if (n1.relNode < n2.relNode) + if (RELFILENODE_GETRELNODE(n1) < RELFILENODE_GETRELNODE(n2)) return -1; - else if (n1.relNode > n2.relNode) + else if (RELFILENODE_GETRELNODE(n1) > RELFILENODE_GETRELNODE(n2)) return 1; if (n1.dbNode < n2.dbNode) @@ -4634,9 +4644,9 @@ buffertag_comparator(const BufferTag *ba, const BufferTag *bb) if (ret != 0) return ret; - if (ba->forkNum < bb->forkNum) + if (RELFILENODE_GETFORKNUM(ba->rnode) < RELFILENODE_GETFORKNUM(bb->rnode)) return -1; - if (ba->forkNum > bb->forkNum) + if (RELFILENODE_GETFORKNUM(ba->rnode) > RELFILENODE_GETFORKNUM(bb->rnode)) return 1; if (ba->blockNum < bb->blockNum) @@ -4801,7 +4811,8 @@ IssuePendingWritebacks(WritebackContext *context) /* different file, stop */ if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) || - cur->tag.forkNum != next->tag.forkNum) + RELFILENODE_GETFORKNUM(cur->tag.rnode) != + RELFILENODE_GETFORKNUM(next->tag.rnode)) break; /* ok, block queued twice, skip */ @@ -4820,7 +4831,8 @@ IssuePendingWritebacks(WritebackContext *context) /* and finally tell the kernel to write the data to storage */ reln = smgropen(tag.rnode, InvalidBackendId); - smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks); +
smgrwriteback(reln, RELFILENODE_GETFORKNUM(tag.rnode), tag.blockNum, + nblocks); } context->nr_pending = 0; diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index e71f95a..2892733 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -221,7 +221,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, /* And write... */ smgrwrite(oreln, - bufHdr->tag.forkNum, + RELFILENODE_GETFORKNUM(bufHdr->tag.rnode), bufHdr->tag.blockNum, localpage, false); @@ -338,14 +338,14 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum, if ((buf_state & BM_TAG_VALID) && RelFileNodeEquals(bufHdr->tag.rnode, rnode) && - bufHdr->tag.forkNum == forkNum && + RELFILENODE_GETFORKNUM(bufHdr->tag.rnode) == forkNum && bufHdr->tag.blockNum >= firstDelBlock) { if (LocalRefCount[i] != 0) elog(ERROR, "block %u of %s is still referenced (local %u)", bufHdr->tag.blockNum, relpathbackend(bufHdr->tag.rnode, MyBackendId, - bufHdr->tag.forkNum), + RELFILENODE_GETFORKNUM(bufHdr->tag.rnode)), LocalRefCount[i]); /* Remove entry from hashtable */ hresult = (LocalBufferLookupEnt *) @@ -389,7 +389,7 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode) elog(ERROR, "block %u of %s is still referenced (local %u)", bufHdr->tag.blockNum, relpathbackend(bufHdr->tag.rnode, MyBackendId, - bufHdr->tag.forkNum), + RELFILENODE_GETFORKNUM(bufHdr->tag.rnode)), LocalRefCount[i]); /* Remove entry from hashtable */ hresult = (LocalBufferLookupEnt *) diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c index d165b35..41942f5 100644 --- a/src/backend/storage/freespace/fsmpage.c +++ b/src/backend/storage/freespace/fsmpage.c @@ -273,8 +273,8 @@ restart: BlockNumber blknum; BufferGetTag(buf, &rnode, &forknum, &blknum); - elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u", - blknum, rnode.spcNode, rnode.dbNode, rnode.relNode); + elog(DEBUG1, "fixing 
corrupt FSM block %u, relation %u/%u/" INT64_FORMAT, + blknum, rnode.spcNode, rnode.dbNode, RELFILENODE_GETRELNODE(rnode)); /* make sure we hold an exclusive lock */ if (!exclusive_lock_held) diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 6c7cf6c..1eb6d78 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +RelNodeGenLock 48 \ No newline at end of file diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index d26c915..8e2c60f 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -124,8 +124,6 @@ static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior); static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); -static void register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum, - BlockNumber segno); static void register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum, BlockNumber segno); static void _fdvec_resize(SMgrRelation reln, @@ -321,36 +319,25 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) /* * Delete or truncate the first segment. 
- if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode)) + if (!RelFileNodeBackendIsTemp(rnode)) { - if (!RelFileNodeBackendIsTemp(rnode)) - { - /* Prevent other backends' fds from holding on to the disk space */ - ret = do_truncate(path); - - /* Forget any pending sync requests for the first segment */ - register_forget_request(rnode, forkNum, 0 /* first seg */ ); - } - else - ret = 0; + /* Prevent other backends' fds from holding on to the disk space */ + ret = do_truncate(path); - /* Next unlink the file, unless it was already found to be missing */ - if (ret == 0 || errno != ENOENT) - { - ret = unlink(path); - if (ret < 0 && errno != ENOENT) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", path))); - } + /* Forget any pending sync requests for the first segment */ + register_forget_request(rnode, forkNum, 0 /* first seg */ ); } else - { - /* Prevent other backends' fds from holding on to the disk space */ - ret = do_truncate(path); + ret = 0; - /* Register request to unlink first segment later */ - register_unlink_segment(rnode, forkNum, 0 /* first seg */ ); + /* Next unlink the file, unless it was already found to be missing */ + if (ret == 0 || errno != ENOENT) + { + ret = unlink(path); + if (ret < 0 && errno != ENOENT) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); } /* @@ -640,7 +627,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RELFILENODE_GETRELNODE(reln->smgr_rnode.node), reln->smgr_rnode.backend); v = _mdfd_getseg(reln, forknum, blocknum, false, @@ -655,7 +642,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, -
reln->smgr_rnode.node.relNode, + RELFILENODE_GETRELNODE(reln->smgr_rnode.node), reln->smgr_rnode.backend, nbytes, BLCKSZ); @@ -710,7 +697,7 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RELFILENODE_GETRELNODE(reln->smgr_rnode.node), reln->smgr_rnode.backend); v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, @@ -725,7 +712,7 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RELFILENODE_GETRELNODE(reln->smgr_rnode.node), reln->smgr_rnode.backend, nbytes, BLCKSZ); @@ -995,23 +982,6 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) } /* - * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint - */ -static void -register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum, - BlockNumber segno) -{ - FileTag tag; - - INIT_MD_FILETAG(tag, rnode.node, forknum, segno); - - /* Should never be used with temp relations */ - Assert(!RelFileNodeBackendIsTemp(rnode)); - - RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ ); -} - -/* * register_forget_request() -- forget any fsyncs for a relation fork's segment */ static void @@ -1036,7 +1006,7 @@ ForgetDatabaseSyncRequests(Oid dbid) rnode.dbNode = dbid; rnode.spcNode = 0; - rnode.relNode = 0; + RELFILENODE_SETRELNODE(rnode, 0); INIT_MD_FILETAG(tag, rnode, InvalidForkNumber, InvalidBlockNumber); diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c index 543f691..46a1242 100644 --- a/src/backend/storage/sync/sync.c +++ b/src/backend/storage/sync/sync.c @@ -188,92 +188,6 @@ SyncPreCheckpoint(void) } /* - * SyncPostCheckpoint() -- Do post-checkpoint work - * - * Remove any
lingering files that can now be safely removed. - */ -void -SyncPostCheckpoint(void) -{ - int absorb_counter; - ListCell *lc; - - absorb_counter = UNLINKS_PER_ABSORB; - foreach(lc, pendingUnlinks) - { - PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(lc); - char path[MAXPGPATH]; - - /* Skip over any canceled entries */ - if (entry->canceled) - continue; - - /* - * New entries are appended to the end, so if the entry is new we've - * reached the end of old entries. - * - * Note: if just the right number of consecutive checkpoints fail, we - * could be fooled here by cycle_ctr wraparound. However, the only - * consequence is that we'd delay unlinking for one more checkpoint, - * which is perfectly tolerable. - */ - if (entry->cycle_ctr == checkpoint_cycle_ctr) - break; - - /* Unlink the file */ - if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag, - path) < 0) - { - /* - * There's a race condition, when the database is dropped at the - * same time that we process the pending unlink requests. If the - * DROP DATABASE deletes the file before we do, we will get ENOENT - * here. rmtree() also has to ignore ENOENT errors, to deal with - * the possibility that we delete the file first. - */ - if (errno != ENOENT) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", path))); - } - - /* Mark the list entry as canceled, just in case */ - entry->canceled = true; - - /* - * As in ProcessSyncRequests, we don't want to stop absorbing fsync - * requests for a long time when there are many deletions to be done. - * We can safely call AbsorbSyncRequests() at this point in the loop. - */ - if (--absorb_counter <= 0) - { - AbsorbSyncRequests(); - absorb_counter = UNLINKS_PER_ABSORB; - } - } - - /* - * If we reached the end of the list, we can just remove the whole list - * (remembering to pfree all the PendingUnlinkEntry objects). Otherwise, - * we must keep the entries at or after "lc". 
- */ - if (lc == NULL) - { - list_free_deep(pendingUnlinks); - pendingUnlinks = NIL; - } - else - { - int ntodelete = list_cell_number(pendingUnlinks, lc); - - for (int i = 0; i < ntodelete; i++) - pfree(list_nth(pendingUnlinks, i)); - - pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete); - } -} - -/* * ProcessSyncRequests() -- Process queued fsync requests. */ void @@ -519,21 +433,6 @@ RememberSyncRequest(const FileTag *ftag, SyncRequestType type) entry->canceled = true; } } - else if (type == SYNC_UNLINK_REQUEST) - { - /* Unlink request: put it in the linked list */ - MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); - PendingUnlinkEntry *entry; - - entry = palloc(sizeof(PendingUnlinkEntry)); - entry->tag = *ftag; - entry->cycle_ctr = checkpoint_cycle_ctr; - entry->canceled = false; - - pendingUnlinks = lappend(pendingUnlinks, entry); - - MemoryContextSwitchTo(oldcxt); - } else { /* Normal case: enter a request to fsync this segment */ diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index 3a2f2e1..03fcac7 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -850,7 +850,7 @@ Datum pg_relation_filenode(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); - Oid result; + RelNode result; HeapTuple tuple; Form_pg_class relform; @@ -870,15 +870,15 @@ pg_relation_filenode(PG_FUNCTION_ARGS) else { /* no storage, return NULL */ - result = InvalidOid; + result = InvalidRelfileNode; } ReleaseSysCache(tuple); - if (!OidIsValid(result)) + if (!RelfileNodeIsValid(result)) PG_RETURN_NULL(); - PG_RETURN_OID(result); + PG_RETURN_INT64(result); } /* @@ -898,11 +898,11 @@ Datum pg_filenode_relation(PG_FUNCTION_ARGS) { Oid reltablespace = PG_GETARG_OID(0); - Oid relfilenode = PG_GETARG_OID(1); + RelNode relfilenode = PG_GETARG_INT64(1); Oid heaprel; /* test needed so RelidByRelfilenode doesn't misbehave */ - if (!OidIsValid(relfilenode)) + if (!RelfileNodeIsValid(relfilenode)) PG_RETURN_NULL(); heaprel 
= RelidByRelfilenode(reltablespace, relfilenode); @@ -945,21 +945,21 @@ pg_relation_filepath(PG_FUNCTION_ARGS) else rnode.dbNode = MyDatabaseId; if (relform->relfilenode) - rnode.relNode = relform->relfilenode; + RELFILENODE_SETRELNODE(rnode, relform->relfilenode); else /* Consult the relation mapper */ - rnode.relNode = RelationMapOidToFilenode(relid, - relform->relisshared); + RELFILENODE_SETRELNODE(rnode, RelationMapOidToFilenode(relid, + relform->relisshared)); } else { /* no storage, return NULL */ - rnode.relNode = InvalidOid; + RELFILENODE_SETRELNODE(rnode, InvalidRelfileNode); /* some compilers generate warnings without these next two lines */ rnode.dbNode = InvalidOid; rnode.spcNode = InvalidOid; } - if (!OidIsValid(rnode.relNode)) + if (RELFILENODE_GETRELNODE(rnode) == InvalidRelfileNode) { ReleaseSysCache(tuple); PG_RETURN_NULL(); diff --git a/src/backend/utils/adt/pg_upgrade_support.c b/src/backend/utils/adt/pg_upgrade_support.c index 67b9675e..ab8d148 100644 --- a/src/backend/utils/adt/pg_upgrade_support.c +++ b/src/backend/utils/adt/pg_upgrade_support.c @@ -142,10 +142,10 @@ binary_upgrade_set_next_toast_pg_class_oid(PG_FUNCTION_ARGS) Datum binary_upgrade_set_next_toast_relfilenode(PG_FUNCTION_ARGS) { - Oid nodeoid = PG_GETARG_OID(0); + RelNode relnode = PG_GETARG_INT64(0); CHECK_IS_BINARY_UPGRADE; - binary_upgrade_next_toast_pg_class_relfilenode = nodeoid; + binary_upgrade_next_toast_pg_class_relfilenode = relnode; PG_RETURN_VOID(); } diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 2e760e8..9f86ea7 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1288,7 +1288,7 @@ retry: static void RelationInitPhysicalAddr(Relation relation) { - Oid oldnode = relation->rd_node.relNode; + RelNode oldnode = RELFILENODE_GETRELNODE(relation->rd_node); /* these relations kinds never have storage */ if (!RELKIND_HAS_STORAGE(relation->rd_rel->relkind)) @@ -1335,15 +1335,16 @@
RelationInitPhysicalAddr(Relation relation) heap_freetuple(phys_tuple); } - relation->rd_node.relNode = relation->rd_rel->relfilenode; + RELFILENODE_SETRELNODE(relation->rd_node, + relation->rd_rel->relfilenode); } else { /* Consult the relation mapper */ - relation->rd_node.relNode = - RelationMapOidToFilenode(relation->rd_id, - relation->rd_rel->relisshared); - if (!OidIsValid(relation->rd_node.relNode)) + RELFILENODE_SETRELNODE(relation->rd_node, + RelationMapOidToFilenode(relation->rd_id, + relation->rd_rel->relisshared)); + if (RELFILENODE_GETRELNODE(relation->rd_node) == InvalidRelfileNode) elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u", RelationGetRelationName(relation), relation->rd_id); } @@ -1353,7 +1354,8 @@ RelationInitPhysicalAddr(Relation relation) * rd_firstRelfilenodeSubid. No subtransactions start or end while in * parallel mode, so the specific SubTransactionId does not matter. */ - if (IsParallelWorker() && oldnode != relation->rd_node.relNode) + if (IsParallelWorker() && oldnode != + RELFILENODE_GETRELNODE(relation->rd_node)) { if (RelFileNodeSkippingWAL(relation->rd_node)) relation->rd_firstRelfilenodeSubid = TopSubTransactionId; @@ -1958,13 +1960,13 @@ formrdesc(const char *relationName, Oid relationReltype, /* * All relations made with formrdesc are mapped. This is necessarily so * because there is no other way to know what filenode they currently - * have. In bootstrap mode, add them to the initial relation mapper data, - * specifying that the initial filenode is the same as the OID. + * have. In bootstrap mode, generate a new relfilenode and add them to the + * initial relation mapper data. 
*/ - relation->rd_rel->relfilenode = InvalidOid; + relation->rd_rel->relfilenode = InvalidRelfileNode; if (IsBootstrapProcessingMode()) RelationMapUpdateMap(RelationGetRelid(relation), - RelationGetRelid(relation), + GetNewRelNode(), isshared, true); /* @@ -3433,7 +3435,7 @@ RelationBuildLocalRelation(const char *relname, TupleDesc tupDesc, Oid relid, Oid accessmtd, - Oid relfilenode, + RelNode relfilenode, Oid reltablespace, bool shared_relation, bool mapped_relation, @@ -3604,7 +3606,7 @@ RelationBuildLocalRelation(const char *relname, if (mapped_relation) { - rel->rd_rel->relfilenode = InvalidOid; + rel->rd_rel->relfilenode = InvalidRelfileNode; /* Add it to the active mapping information */ RelationMapUpdateMap(relid, relfilenode, shared_relation, true); } @@ -3673,7 +3675,7 @@ RelationBuildLocalRelation(const char *relname, void RelationSetNewRelfilenode(Relation relation, char persistence) { - Oid newrelfilenode; + RelNode newrelfilenode; Relation pg_class; HeapTuple tuple; Form_pg_class classform; @@ -3682,7 +3684,7 @@ RelationSetNewRelfilenode(Relation relation, char persistence) RelFileNode newrnode; /* Allocate a new relfilenode */ - newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL, + newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, persistence); /* @@ -3711,7 +3713,8 @@ RelationSetNewRelfilenode(Relation relation, char persistence) * caught here, if GetNewRelFileNode messes up for any reason. 
*/ newrnode = relation->rd_node; - newrnode.relNode = newrelfilenode; + RELFILENODE_SETRELNODE(newrnode, newrelfilenode); + if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind)) { diff --git a/src/backend/utils/cache/relfilenodemap.c b/src/backend/utils/cache/relfilenodemap.c index 70c323c..8c4e924 100644 --- a/src/backend/utils/cache/relfilenodemap.c +++ b/src/backend/utils/cache/relfilenodemap.c @@ -37,7 +37,7 @@ static ScanKeyData relfilenode_skey[2]; typedef struct { Oid reltablespace; - Oid relfilenode; + RelNodeId relfilenode; } RelfilenodeMapKey; typedef struct @@ -135,7 +135,7 @@ InitializeRelfilenodeMap(void) * Returns InvalidOid if no relation matching the criteria could be found. */ Oid -RelidByRelfilenode(Oid reltablespace, Oid relfilenode) +RelidByRelfilenode(Oid reltablespace, RelNode relfilenode) { RelfilenodeMapKey key; RelfilenodeMapEntry *entry; @@ -155,7 +155,7 @@ RelidByRelfilenode(Oid reltablespace, Oid relfilenode) MemSet(&key, 0, sizeof(key)); key.reltablespace = reltablespace; - key.relfilenode = relfilenode; + RELNODEID_SET_RELNODE(key.relfilenode, relfilenode); /* * Check cache and return entry if one is found. 
Even if no target @@ -196,7 +196,7 @@ RelidByRelfilenode(Oid reltablespace, Oid relfilenode) /* set scan arguments */ skey[0].sk_argument = ObjectIdGetDatum(reltablespace); - skey[1].sk_argument = ObjectIdGetDatum(relfilenode); + skey[1].sk_argument = Int64GetDatum(relfilenode); scandesc = systable_beginscan(relation, ClassTblspcRelfilenodeIndexId, @@ -213,7 +213,7 @@ RelidByRelfilenode(Oid reltablespace, Oid relfilenode) if (found) elog(ERROR, - "unexpected duplicate for tablespace %u, relfilenode %u", + "unexpected duplicate for tablespace %u, relfilenode " INT64_FORMAT, reltablespace, relfilenode); found = true; diff --git a/src/backend/utils/cache/relmapper.c b/src/backend/utils/cache/relmapper.c index 4f6811f..503dd19 100644 --- a/src/backend/utils/cache/relmapper.c +++ b/src/backend/utils/cache/relmapper.c @@ -79,7 +79,7 @@ typedef struct RelMapping { Oid mapoid; /* OID of a catalog */ - Oid mapfilenode; /* its filenode number */ + RelNodeId mapfilenode; /* its filenode number */ } RelMapping; typedef struct RelMapFile @@ -132,7 +132,7 @@ static RelMapFile pending_local_updates; /* non-export function prototypes */ -static void apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, +static void apply_map_update(RelMapFile *map, Oid relationId, RelNode fileNode, bool add_okay); static void merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay); @@ -155,7 +155,7 @@ static void perform_relmap_update(bool shared, const RelMapFile *updates); * Returns InvalidOid if the OID is not known (which should never happen, * but the caller is in a better position to report a meaningful error). 
*/ -Oid +RelNode RelationMapOidToFilenode(Oid relationId, bool shared) { const RelMapFile *map; @@ -168,13 +168,13 @@ RelationMapOidToFilenode(Oid relationId, bool shared) for (i = 0; i < map->num_mappings; i++) { if (relationId == map->mappings[i].mapoid) - return map->mappings[i].mapfilenode; + return RELNODEID_GET_RELNODE(map->mappings[i].mapfilenode); } map = &shared_map; for (i = 0; i < map->num_mappings; i++) { if (relationId == map->mappings[i].mapoid) - return map->mappings[i].mapfilenode; + return RELNODEID_GET_RELNODE(map->mappings[i].mapfilenode); } } else @@ -183,17 +183,17 @@ RelationMapOidToFilenode(Oid relationId, bool shared) for (i = 0; i < map->num_mappings; i++) { if (relationId == map->mappings[i].mapoid) - return map->mappings[i].mapfilenode; + return RELNODEID_GET_RELNODE(map->mappings[i].mapfilenode); } map = &local_map; for (i = 0; i < map->num_mappings; i++) { if (relationId == map->mappings[i].mapoid) - return map->mappings[i].mapfilenode; + return RELNODEID_GET_RELNODE(map->mappings[i].mapfilenode); } } - return InvalidOid; + return InvalidRelfileNode; } /* @@ -209,7 +209,7 @@ RelationMapOidToFilenode(Oid relationId, bool shared) * relfilenode doesn't pertain to a mapped relation. 
*/ Oid -RelationMapFilenodeToOid(Oid filenode, bool shared) +RelationMapFilenodeToOid(RelNode filenode, bool shared) { const RelMapFile *map; int32 i; @@ -220,13 +220,13 @@ RelationMapFilenodeToOid(Oid filenode, bool shared) map = &active_shared_updates; for (i = 0; i < map->num_mappings; i++) { - if (filenode == map->mappings[i].mapfilenode) + if (filenode == RELNODEID_GET_RELNODE(map->mappings[i].mapfilenode)) return map->mappings[i].mapoid; } map = &shared_map; for (i = 0; i < map->num_mappings; i++) { - if (filenode == map->mappings[i].mapfilenode) + if (filenode == RELNODEID_GET_RELNODE(map->mappings[i].mapfilenode)) return map->mappings[i].mapoid; } } @@ -235,13 +235,13 @@ RelationMapFilenodeToOid(Oid filenode, bool shared) map = &active_local_updates; for (i = 0; i < map->num_mappings; i++) { - if (filenode == map->mappings[i].mapfilenode) + if (filenode == RELNODEID_GET_RELNODE(map->mappings[i].mapfilenode)) return map->mappings[i].mapoid; } map = &local_map; for (i = 0; i < map->num_mappings; i++) { - if (filenode == map->mappings[i].mapfilenode) + if (filenode == RELNODEID_GET_RELNODE(map->mappings[i].mapfilenode)) return map->mappings[i].mapoid; } } @@ -258,7 +258,7 @@ RelationMapFilenodeToOid(Oid filenode, bool shared) * immediately. Otherwise it is made pending until CommandCounterIncrement. */ void -RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared, +RelationMapUpdateMap(Oid relationId, RelNode fileNode, bool shared, bool immediate) { RelMapFile *map; @@ -316,7 +316,8 @@ RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared, * add_okay = false to draw an error if not. 
*/ static void -apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, bool add_okay) +apply_map_update(RelMapFile *map, Oid relationId, RelNode fileNode, + bool add_okay) { int32 i; @@ -325,7 +326,7 @@ apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, bool add_okay) { if (relationId == map->mappings[i].mapoid) { - map->mappings[i].mapfilenode = fileNode; + RELNODEID_SET_RELNODE(map->mappings[i].mapfilenode, fileNode); return; } } @@ -337,7 +338,8 @@ apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, bool add_okay) if (map->num_mappings >= MAX_MAPPINGS) elog(ERROR, "ran out of space in relation map"); map->mappings[map->num_mappings].mapoid = relationId; - map->mappings[map->num_mappings].mapfilenode = fileNode; + RELNODEID_SET_RELNODE(map->mappings[map->num_mappings].mapfilenode, + fileNode); map->num_mappings++; } @@ -356,7 +358,8 @@ merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay) { apply_map_update(map, updates->mappings[i].mapoid, - updates->mappings[i].mapfilenode, + RELNODEID_GET_RELNODE( + updates->mappings[i].mapfilenode), add_okay); } } diff --git a/src/backend/utils/misc/pg_controldata.c b/src/backend/utils/misc/pg_controldata.c index 781f8b8..85ed88c 100644 --- a/src/backend/utils/misc/pg_controldata.c +++ b/src/backend/utils/misc/pg_controldata.c @@ -79,8 +79,8 @@ pg_control_system(PG_FUNCTION_ARGS) Datum pg_control_checkpoint(PG_FUNCTION_ARGS) { - Datum values[18]; - bool nulls[18]; + Datum values[19]; + bool nulls[19]; TupleDesc tupdesc; HeapTuple htup; ControlFileData *ControlFile; @@ -129,6 +129,8 @@ pg_control_checkpoint(PG_FUNCTION_ARGS) XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 18, "checkpoint_time", TIMESTAMPTZOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 19, "next_relfilenode", + INT8OID, -1, 0); tupdesc = BlessTupleDesc(tupdesc); /* Read the control file. 
*/ @@ -202,6 +204,9 @@ pg_control_checkpoint(PG_FUNCTION_ARGS) values[17] = TimestampTzGetDatum(time_t_to_timestamptz(ControlFile->checkPointCopy.time)); nulls[17] = false; + values[18] = Int64GetDatum(ControlFile->checkPointCopy.nextRelNode); + nulls[18] = false; + htup = heap_form_tuple(tupdesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(htup)); diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 7e69475..94ec594 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -520,9 +520,9 @@ main(int argc, char *argv[]) mode = PG_MODE_ENABLE; break; case 'f': - if (!option_parse_int(optarg, "-f/--filenode", 0, - INT_MAX, - NULL)) + if (!option_parse_int64(optarg, "-f/--filenode", 0, + LLONG_MAX, + NULL)) exit(1); only_filenode = pstrdup(optarg); break; diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index f911f98..2513fc3 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -250,6 +250,8 @@ main(int argc, char *argv[]) printf(_("Latest checkpoint's NextXID: %u:%u\n"), EpochFromFullTransactionId(ControlFile->checkPointCopy.nextXid), XidFromFullTransactionId(ControlFile->checkPointCopy.nextXid)); + printf(_("Latest checkpoint's NextRelFileNode: " INT64_FORMAT "\n"), + ControlFile->checkPointCopy.nextRelNode); printf(_("Latest checkpoint's NextOID: %u\n"), ControlFile->checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index e3ddf19..2f3968d 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -4637,12 +4637,12 @@ binary_upgrade_set_pg_class_oids(Archive *fout, { PQExpBuffer upgrade_query = createPQExpBuffer(); PGresult *upgrade_res; - Oid relfilenode; + RelNode relfilenode; Oid toast_oid; - Oid toast_relfilenode; + RelNode toast_relfilenode; char relkind; Oid toast_index_oid; - 
Oid toast_index_relfilenode; + RelNode toast_index_relfilenode; /* * Preserve the OID and relfilenode of the table, table's index, table's @@ -4668,11 +4668,11 @@ binary_upgrade_set_pg_class_oids(Archive *fout, relkind = *PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "relkind")); - relfilenode = atooid(PQgetvalue(upgrade_res, 0, + relfilenode = atorelnode(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "relfilenode"))); toast_oid = atooid(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "reltoastrelid"))); - toast_relfilenode = atooid(PQgetvalue(upgrade_res, 0, + toast_relfilenode = atorelnode(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "toast_relfilenode"))); toast_index_oid = atooid(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "indexrelid"))); @@ -4693,9 +4693,9 @@ binary_upgrade_set_pg_class_oids(Archive *fout, * partitioned tables have a relfilenode, which should not be preserved * when upgrading. */ - if (OidIsValid(relfilenode) && relkind != RELKIND_PARTITIONED_TABLE) + if (RelfileNodeIsValid(relfilenode) && relkind != RELKIND_PARTITIONED_TABLE) appendPQExpBuffer(upgrade_buffer, - "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('" INT64_FORMAT "'::pg_catalog.int8);\n", relfilenode); /* @@ -4709,7 +4709,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout, "SELECT pg_catalog.binary_upgrade_set_next_toast_pg_class_oid('%u'::pg_catalog.oid);\n", toast_oid); appendPQExpBuffer(upgrade_buffer, - "SELECT pg_catalog.binary_upgrade_set_next_toast_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT pg_catalog.binary_upgrade_set_next_toast_relfilenode('" INT64_FORMAT "'::pg_catalog.int8);\n", toast_relfilenode); /* every toast table has an index */ @@ -4717,7 +4717,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout, "SELECT pg_catalog.binary_upgrade_set_next_index_pg_class_oid('%u'::pg_catalog.oid);\n", toast_index_oid); appendPQExpBuffer(upgrade_buffer, - 
"SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('" INT64_FORMAT "'::pg_catalog.int8);\n", toast_index_relfilenode); } @@ -4730,7 +4730,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout, "SELECT pg_catalog.binary_upgrade_set_next_index_pg_class_oid('%u'::pg_catalog.oid);\n", pg_class_oid); appendPQExpBuffer(upgrade_buffer, - "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('" INT64_FORMAT "'::pg_catalog.int8);\n", relfilenode); } diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c index 7211090..06d3445 100644 --- a/src/bin/pg_rewind/filemap.c +++ b/src/bin/pg_rewind/filemap.c @@ -512,6 +512,7 @@ isRelDataFile(const char *path) RelFileNode rnode; unsigned int segNo; int nmatch; + RelNode relNode = 0; bool matched; /*---- @@ -535,11 +536,12 @@ isRelDataFile(const char *path) */ rnode.spcNode = InvalidOid; rnode.dbNode = InvalidOid; - rnode.relNode = InvalidOid; + RELFILENODE_SETRELNODE(rnode, 0); /* FIXME-1 */ segNo = 0; matched = false; - nmatch = sscanf(path, "global/%u.%u", &rnode.relNode, &segNo); + nmatch = sscanf(path, "global/" INT64_FORMAT ".%u", &relNode, &segNo); + RELFILENODE_SETRELNODE(rnode, relNode); if (nmatch == 1 || nmatch == 2) { rnode.spcNode = GLOBALTABLESPACE_OID; @@ -548,8 +550,9 @@ isRelDataFile(const char *path) } else { - nmatch = sscanf(path, "base/%u/%u.%u", - &rnode.dbNode, &rnode.relNode, &segNo); + nmatch = sscanf(path, "base/%u/" INT64_FORMAT ".%u", + &rnode.dbNode, &relNode, &segNo); + RELFILENODE_SETRELNODE(rnode, relNode); if (nmatch == 2 || nmatch == 3) { rnode.spcNode = DEFAULTTABLESPACE_OID; @@ -557,9 +560,10 @@ isRelDataFile(const char *path) } else { - nmatch = sscanf(path, "pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/%u.%u", - &rnode.spcNode, &rnode.dbNode, &rnode.relNode, + nmatch = sscanf(path, 
"pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/" INT64_FORMAT ".%u", + &rnode.spcNode, &rnode.dbNode, &relNode, &segNo); + RELFILENODE_SETRELNODE(rnode, relNode); if (nmatch == 3 || nmatch == 4) matched = true; } diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index 69ef231..d3c5d53 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -383,8 +383,8 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) i_reloid, i_indtable, i_toastheap, - i_relfilenode, i_reltablespace; + int i_relfilenode; char query[QUERY_ALLOC]; char *last_namespace = NULL, *last_tablespace = NULL; @@ -511,7 +511,7 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) relname = PQgetvalue(res, relnum, i_relname); curr->relname = pg_strdup(relname); - curr->relfilenode = atooid(PQgetvalue(res, relnum, i_relfilenode)); + curr->relfilenode = atorelnode(PQgetvalue(res, relnum, i_relfilenode)); curr->tblsp_alloc = false; /* Is the tablespace oid non-default? */ diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 1db8e3f..a3503aa 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -122,7 +122,7 @@ typedef struct char *nspname; /* namespace name */ char *relname; /* relation name */ Oid reloid; /* relation OID */ - Oid relfilenode; /* relation file node */ + RelNode relfilenode; /* relation file node */ Oid indtable; /* if index, OID of its table, else 0 */ Oid toastheap; /* if toast table, OID of base table, else 0 */ char *tablespace; /* tablespace path; "" for cluster default */ @@ -146,7 +146,7 @@ typedef struct const char *old_tablespace_suffix; const char *new_tablespace_suffix; Oid db_oid; - Oid relfilenode; + RelNode relfilenode; /* the rest are used only for logging and error reporting */ char *nspname; /* namespaces */ char *relname; diff --git a/src/bin/pg_upgrade/relfilenode.c b/src/bin/pg_upgrade/relfilenode.c index 2f4deb3..10e6a6c 100644 --- 
a/src/bin/pg_upgrade/relfilenode.c +++ b/src/bin/pg_upgrade/relfilenode.c @@ -190,14 +190,14 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro else snprintf(extent_suffix, sizeof(extent_suffix), ".%d", segno); - snprintf(old_file, sizeof(old_file), "%s%s/%u/%u%s%s", + snprintf(old_file, sizeof(old_file), "%s%s/%u/" INT64_FORMAT "%s%s", map->old_tablespace, map->old_tablespace_suffix, map->db_oid, map->relfilenode, type_suffix, extent_suffix); - snprintf(new_file, sizeof(new_file), "%s%s/%u/%u%s%s", + snprintf(new_file, sizeof(new_file), "%s%s/%u/" INT64_FORMAT "%s%s", map->new_tablespace, map->new_tablespace_suffix, map->db_oid, diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index a6251e1..ae4a2d8 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -518,15 +518,17 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk); if (forknum != MAIN_FORKNUM) - printf(", blkref #%d: rel %u/%u/%u fork %s blk %u", + printf(", blkref #%d: rel %u/%u/" INT64_FORMAT " fork %s blk %u", block_id, - rnode.spcNode, rnode.dbNode, rnode.relNode, + rnode.spcNode, rnode.dbNode, + RELFILENODE_GETRELNODE(rnode), forkNames[forknum], blk); else - printf(", blkref #%d: rel %u/%u/%u blk %u", + printf(", blkref #%d: rel %u/%u/" INT64_FORMAT " blk %u", block_id, - rnode.spcNode, rnode.dbNode, rnode.relNode, + rnode.spcNode, rnode.dbNode, + RELFILENODE_GETRELNODE(rnode), blk); if (XLogRecHasBlockImage(record, block_id)) { @@ -548,9 +550,9 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) continue; XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk); - printf("\tblkref #%d: rel %u/%u/%u fork %s blk %u", + printf("\tblkref #%d: rel %u/%u/" INT64_FORMAT " fork %s blk %u", block_id, - rnode.spcNode, rnode.dbNode, rnode.relNode, + rnode.spcNode, rnode.dbNode, RELFILENODE_GETRELNODE(rnode), 
forkNames[forknum], blk); if (XLogRecHasBlockImage(record, block_id)) diff --git a/src/common/relpath.c b/src/common/relpath.c index 636c96e..27b8547 100644 --- a/src/common/relpath.c +++ b/src/common/relpath.c @@ -138,7 +138,7 @@ GetDatabasePath(Oid dbNode, Oid spcNode) * the trouble considering BackendId is just int anyway. */ char * -GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode, +GetRelationPath(Oid dbNode, Oid spcNode, RelNode relNode, int backendId, ForkNumber forkNumber) { char *path; @@ -149,10 +149,10 @@ GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode, Assert(dbNode == 0); Assert(backendId == InvalidBackendId); if (forkNumber != MAIN_FORKNUM) - path = psprintf("global/%u_%s", + path = psprintf("global/" INT64_FORMAT "_%s", relNode, forkNames[forkNumber]); else - path = psprintf("global/%u", relNode); + path = psprintf("global/" INT64_FORMAT, relNode); } else if (spcNode == DEFAULTTABLESPACE_OID) { @@ -160,21 +160,21 @@ GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode, if (backendId == InvalidBackendId) { if (forkNumber != MAIN_FORKNUM) - path = psprintf("base/%u/%u_%s", + path = psprintf("base/%u/" INT64_FORMAT "_%s", dbNode, relNode, forkNames[forkNumber]); else - path = psprintf("base/%u/%u", + path = psprintf("base/%u/" INT64_FORMAT, dbNode, relNode); } else { if (forkNumber != MAIN_FORKNUM) - path = psprintf("base/%u/t%d_%u_%s", + path = psprintf("base/%u/t%d_" INT64_FORMAT "_%s", dbNode, backendId, relNode, forkNames[forkNumber]); else - path = psprintf("base/%u/t%d_%u", + path = psprintf("base/%u/t%d_" INT64_FORMAT, dbNode, backendId, relNode); } } @@ -184,24 +184,24 @@ GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode, if (backendId == InvalidBackendId) { if (forkNumber != MAIN_FORKNUM) - path = psprintf("pg_tblspc/%u/%s/%u/%u_%s", + path = psprintf("pg_tblspc/%u/%s/%u/" INT64_FORMAT "_%s", spcNode, TABLESPACE_VERSION_DIRECTORY, dbNode, relNode, forkNames[forkNumber]); else - path = psprintf("pg_tblspc/%u/%s/%u/%u", + path = 
psprintf("pg_tblspc/%u/%s/%u/" INT64_FORMAT, spcNode, TABLESPACE_VERSION_DIRECTORY, dbNode, relNode); } else { if (forkNumber != MAIN_FORKNUM) - path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u_%s", + path = psprintf("pg_tblspc/%u/%s/%u/t%d_" INT64_FORMAT "_%s", spcNode, TABLESPACE_VERSION_DIRECTORY, dbNode, backendId, relNode, forkNames[forkNumber]); else - path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u", + path = psprintf("pg_tblspc/%u/%s/%u/t%d_" INT64_FORMAT, spcNode, TABLESPACE_VERSION_DIRECTORY, dbNode, backendId, relNode); } diff --git a/src/fe_utils/option_utils.c b/src/fe_utils/option_utils.c index abea881..2cb3370 100644 --- a/src/fe_utils/option_utils.c +++ b/src/fe_utils/option_utils.c @@ -82,3 +82,45 @@ option_parse_int(const char *optarg, const char *optname, *result = val; return true; } + +/* + * option_parse_int64 + * + * Same as option_parse_int but parse int64. + */ +bool +option_parse_int64(const char *optarg, const char *optname, + int64 min_range, int64 max_range, + int64 *result) +{ + char *endptr; + int64 val; + + errno = 0; + val = strtoi64(optarg, &endptr, 10); + + /* + * Skip any trailing whitespace; if anything but whitespace remains before + * the terminating character, fail. + */ + while (*endptr != '\0' && isspace((unsigned char) *endptr)) + endptr++; + + if (*endptr != '\0') + { + pg_log_error("invalid value \"%s\" for option %s", + optarg, optname); + return false; + } + + if (errno == ERANGE || val < min_range || val > max_range) + { + pg_log_error("%s must be in range " INT64_FORMAT ".." 
INT64_FORMAT, + optname, min_range, max_range); + return false; + } + + if (result) + *result = val; + return true; +} diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 9a2816d..8113335 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -217,6 +217,9 @@ typedef struct VariableCacheData */ Oid nextOid; /* next OID to assign */ uint32 oidCount; /* OIDs available before must do XLOG work */ + RelNode nextRelNode; /* next relfilenode to assign */ + uint32 relnodecount; /* Relfilenode available before must do XLOG + work */ /* * These fields are protected by XidGenLock. @@ -298,6 +301,7 @@ extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid, extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid); extern bool ForceTransactionIdLimitUpdate(void); extern Oid GetNewObjectId(void); +extern RelNode GetNewRelNode(void); extern void StopGeneratingPinnedObjectIds(void); #ifdef USE_ASSERT_CHECKING diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index bb0c526..04f0cd6 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -304,6 +304,7 @@ extern bool CreateRestartPoint(int flags); extern WALAvailability GetWALAvailability(XLogRecPtr targetLSN); extern XLogRecPtr CalculateMaxmumSafeLSN(void); extern void XLogPutNextOid(Oid nextOid); +extern void XLogPutNextRelFileNode(RelNode nextrelnode); extern XLogRecPtr XLogRestorePoint(const char *rpName); extern void UpdateFullPageWrites(void); extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p); diff --git a/src/include/catalog/binary_upgrade.h b/src/include/catalog/binary_upgrade.h index 0b6944b..d2b45ba 100644 --- a/src/include/catalog/binary_upgrade.h +++ b/src/include/catalog/binary_upgrade.h @@ -26,7 +26,7 @@ extern PGDLLIMPORT Oid binary_upgrade_next_heap_pg_class_relfilenode; extern PGDLLIMPORT Oid binary_upgrade_next_index_pg_class_oid; extern PGDLLIMPORT Oid 
binary_upgrade_next_index_pg_class_relfilenode; extern PGDLLIMPORT Oid binary_upgrade_next_toast_pg_class_oid; -extern PGDLLIMPORT Oid binary_upgrade_next_toast_pg_class_relfilenode; +extern PGDLLIMPORT RelNode binary_upgrade_next_toast_pg_class_relfilenode; extern PGDLLIMPORT Oid binary_upgrade_next_pg_enum_oid; extern PGDLLIMPORT Oid binary_upgrade_next_pg_authid_oid; diff --git a/src/include/catalog/catalog.h b/src/include/catalog/catalog.h index 60c1215..1b83c79 100644 --- a/src/include/catalog/catalog.h +++ b/src/include/catalog/catalog.h @@ -15,6 +15,7 @@ #define CATALOG_H #include "catalog/pg_class.h" +#include "storage/relfilenode.h" #include "utils/relcache.h" @@ -38,7 +39,6 @@ extern bool IsPinnedObject(Oid classId, Oid objectId); extern Oid GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn); -extern Oid GetNewRelFileNode(Oid reltablespace, Relation pg_class, - char relpersistence); +extern RelNode GetNewRelFileNode(Oid reltablespace, char relpersistence); #endif /* CATALOG_H */ diff --git a/src/include/catalog/heap.h b/src/include/catalog/heap.h index c4757bd..66d41af 100644 --- a/src/include/catalog/heap.h +++ b/src/include/catalog/heap.h @@ -50,7 +50,7 @@ extern Relation heap_create(const char *relname, Oid relnamespace, Oid reltablespace, Oid relid, - Oid relfilenode, + RelNode relfilenode, Oid accessmtd, TupleDesc tupDesc, char relkind, diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index a1d6e3b..1e79ec9 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -71,7 +71,7 @@ extern Oid index_create(Relation heapRelation, Oid indexRelationId, Oid parentIndexRelid, Oid parentConstraintId, - Oid relFileNode, + RelNode relFileNode, IndexInfo *indexInfo, List *indexColNames, Oid accessMethodObjectId, diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h index 304e8c1..4659ed3 100644 --- a/src/include/catalog/pg_class.h +++ b/src/include/catalog/pg_class.h @@ -52,13 
+52,13 @@ CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat /* access method; 0 if not a table / index */ Oid relam BKI_DEFAULT(heap) BKI_LOOKUP_OPT(pg_am); - /* identifier of physical storage file */ - /* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */ - Oid relfilenode BKI_DEFAULT(0); - /* identifier of table space for relation (0 means default for database) */ Oid reltablespace BKI_DEFAULT(0) BKI_LOOKUP_OPT(pg_tablespace); + /* identifier of physical storage file */ + /* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */ + int64 relfilenode BKI_DEFAULT(0); + /* # of blocks (not always up-to-date) */ int32 relpages BKI_DEFAULT(0); @@ -154,7 +154,7 @@ typedef FormData_pg_class *Form_pg_class; DECLARE_UNIQUE_INDEX_PKEY(pg_class_oid_index, 2662, ClassOidIndexId, on pg_class using btree(oid oid_ops)); DECLARE_UNIQUE_INDEX(pg_class_relname_nsp_index, 2663, ClassNameNspIndexId, on pg_class using btree(relname name_ops, relnamespace oid_ops)); -DECLARE_INDEX(pg_class_tblspc_relfilenode_index, 3455, ClassTblspcRelfilenodeIndexId, on pg_class using btree(reltablespace oid_ops, relfilenode oid_ops)); +DECLARE_INDEX(pg_class_tblspc_relfilenode_index, 3455, ClassTblspcRelfilenodeIndexId, on pg_class using btree(reltablespace oid_ops, relfilenode int8_ops)); #ifdef EXPOSE_TO_CLIENT_CODE diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 1f3dc24..27d584d 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -41,6 +41,7 @@ typedef struct CheckPoint * timeline (equals ThisTimeLineID otherwise) */ bool fullPageWrites; /* current full_page_writes */ FullTransactionId nextXid; /* next free transaction ID */ + RelNode nextRelNode; /* next relfile node */ Oid nextOid; /* next free OID */ MultiXactId nextMulti; /* next free MultiXactId */ MultiXactOffset nextMultiOffset; /* next free MultiXact offset */ @@ -78,6 +79,7 @@ typedef struct CheckPoint 
#define XLOG_FPI 0xB0 /* 0xC0 is used in Postgres 9.5-11 */ #define XLOG_OVERWRITE_CONTRECORD 0xD0 +#define XLOG_NEXT_RELFILENODE 0xE0 /* diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 0859dc8..bc21fdc 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -7270,11 +7270,11 @@ proname => 'pg_indexes_size', provolatile => 'v', prorettype => 'int8', proargtypes => 'regclass', prosrc => 'pg_indexes_size' }, { oid => '2999', descr => 'filenode identifier of relation', - proname => 'pg_relation_filenode', provolatile => 's', prorettype => 'oid', + proname => 'pg_relation_filenode', provolatile => 's', prorettype => 'int8', proargtypes => 'regclass', prosrc => 'pg_relation_filenode' }, { oid => '3454', descr => 'relation OID for filenode and tablespace', proname => 'pg_filenode_relation', provolatile => 's', - prorettype => 'regclass', proargtypes => 'oid oid', + prorettype => 'regclass', proargtypes => 'oid int8', prosrc => 'pg_filenode_relation' }, { oid => '3034', descr => 'file path of relation', proname => 'pg_relation_filepath', provolatile => 's', prorettype => 'text', @@ -11050,7 +11050,7 @@ prosrc => 'binary_upgrade_set_next_index_relfilenode' }, { oid => '4547', descr => 'for use by pg_upgrade', proname => 'binary_upgrade_set_next_toast_relfilenode', provolatile => 'v', - proparallel => 'u', prorettype => 'void', proargtypes => 'oid', + proparallel => 'u', prorettype => 'void', proargtypes => 'int8', prosrc => 'binary_upgrade_set_next_toast_relfilenode' }, { oid => '4548', descr => 'for use by pg_upgrade', proname => 'binary_upgrade_set_next_pg_tablespace_oid', provolatile => 'v', diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h index 5d4037f..297c20b 100644 --- a/src/include/commands/tablecmds.h +++ b/src/include/commands/tablecmds.h @@ -66,7 +66,7 @@ extern void SetRelationHasSubclass(Oid relationId, bool relhassubclass); extern bool 
CheckRelationTableSpaceMove(Relation rel, Oid newTableSpaceId); extern void SetRelationTableSpace(Relation rel, Oid newTableSpaceId, - Oid newRelFileNode); + RelNode newRelFileNode); extern ObjectAddress renameatt(RenameStmt *stmt); diff --git a/src/include/common/relpath.h b/src/include/common/relpath.h index a4b5dc8..52f06a5 100644 --- a/src/include/common/relpath.h +++ b/src/include/common/relpath.h @@ -66,7 +66,7 @@ extern int forkname_chars(const char *str, ForkNumber *fork); */ extern char *GetDatabasePath(Oid dbNode, Oid spcNode); -extern char *GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode, +extern char *GetRelationPath(Oid dbNode, Oid spcNode, RelNode relNode, int backendId, ForkNumber forkNumber); /* @@ -76,8 +76,8 @@ extern char *GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode, /* First argument is a RelFileNode */ #define relpathbackend(rnode, backend, forknum) \ - GetRelationPath((rnode).dbNode, (rnode).spcNode, (rnode).relNode, \ - backend, forknum) + GetRelationPath((rnode).dbNode, (rnode).spcNode, \ + RELFILENODE_GETRELNODE((rnode)), backend, forknum) /* First argument is a RelFileNode */ #define relpathperm(rnode, forknum) \ diff --git a/src/include/fe_utils/option_utils.h b/src/include/fe_utils/option_utils.h index 03c09fd..8c0e818 100644 --- a/src/include/fe_utils/option_utils.h +++ b/src/include/fe_utils/option_utils.h @@ -22,5 +22,8 @@ extern void handle_help_version_opts(int argc, char *argv[], extern bool option_parse_int(const char *optarg, const char *optname, int min_range, int max_range, int *result); +extern bool option_parse_int64(const char *optarg, const char *optname, + int64 min_range, int64 max_range, + int64 *result); #endif /* OPTION_UTILS_H */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 3e9bdc7..ab0648a 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2900,7 +2900,7 @@ typedef struct IndexStmt List *excludeOpNames; /* exclusion operator names, 
or NIL if none */ char *idxcomment; /* comment to apply to index, or NULL */ Oid indexOid; /* OID of an existing index, if any */ - Oid oldNode; /* relfilenode of existing storage, if any */ + RelNode oldNode; /* relfilenode of existing storage, if any */ SubTransactionId oldCreateSubid; /* rd_createSubid of oldNode */ SubTransactionId oldFirstRelfilenodeSubid; /* rd_firstRelfilenodeSubid of * oldNode */ diff --git a/src/include/postgres_ext.h b/src/include/postgres_ext.h index fdb61b7..bd907f7 100644 --- a/src/include/postgres_ext.h +++ b/src/include/postgres_ext.h @@ -46,6 +46,21 @@ typedef unsigned int Oid; /* Define a signed 64-bit integer type for use in client API declarations. */ typedef PG_INT64_TYPE pg_int64; +/* + * RelNode data type identifies the specific relation file name. RelNode is + * unique within a cluster. + * + * XXX ideally we can use uint64 but currently we only have int8 as an exposed + * datatype so maybe we should make a new datatype relnode which will be of + * type 8 bytes unsigned integer. + */ +typedef pg_int64 RelNode; + +#define atorelnode(x) ((RelNode) strtoull((x), NULL, 10)) + +#define InvalidRelfileNode ((RelNode) 0) +#define FirstNormalRelfileNode ((RelNode) 1) +#define RelfileNodeIsValid(relNode) ((bool) ((relNode) != InvalidRelfileNode)) /* * Identifiers of error message fields. 
Kept here to keep common diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index b903d2b..293dc90 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -21,6 +21,7 @@ #include "storage/condition_variable.h" #include "storage/latch.h" #include "storage/lwlock.h" +#include "storage/relfilenode.h" #include "storage/shmem.h" #include "storage/smgr.h" #include "storage/spin.h" @@ -91,7 +92,6 @@ typedef struct buftag { RelFileNode rnode; /* physical relation identifier */ - ForkNumber forkNum; BlockNumber blockNum; /* blknum relative to begin of reln */ } BufferTag; @@ -99,23 +99,23 @@ typedef struct buftag ( \ (a).rnode.spcNode = InvalidOid, \ (a).rnode.dbNode = InvalidOid, \ - (a).rnode.relNode = InvalidOid, \ - (a).forkNum = InvalidForkNumber, \ + RELFILENODE_SETRELNODE((a).rnode, 0), \ + RELFILENODE_SETFORKNUM((a).rnode, InvalidForkNumber), \ (a).blockNum = InvalidBlockNumber \ ) #define INIT_BUFFERTAG(a,xx_rnode,xx_forkNum,xx_blockNum) \ ( \ (a).rnode = (xx_rnode), \ - (a).forkNum = (xx_forkNum), \ - (a).blockNum = (xx_blockNum) \ + (a).blockNum = (xx_blockNum), \ + RELFILENODE_SETFORKNUM((a).rnode, (xx_forkNum)) \ ) #define BUFFERTAGS_EQUAL(a,b) \ ( \ RelFileNodeEquals((a).rnode, (b).rnode) && \ (a).blockNum == (b).blockNum && \ - (a).forkNum == (b).forkNum \ + RELFILENODE_GETFORKNUM((a).rnode) == RELFILENODE_GETFORKNUM((b).rnode) \ ) /* diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h index 4fdc606..5474311 100644 --- a/src/include/storage/relfilenode.h +++ b/src/include/storage/relfilenode.h @@ -18,6 +18,18 @@ #include "storage/backendid.h" /* + * RelNodeId: + * + * this is a storage type for RelNode. The reasoning behind using this is same + * as using the BlockId so refer comment atop BlockId. 
+ */ +typedef struct RelNodeId +{ + uint32 rn_hi; + uint32 rn_lo; +} RelNodeId; + +/* * RelFileNode must provide all that we need to know to physically access * a relation, with the exception of the backend ID, which can be provided * separately. Note, however, that a "physical" relation is comprised of @@ -31,11 +43,14 @@ * "shared" relations (those common to all databases of a cluster). * Nonzero dbNode values correspond to pg_database.oid. * - * relNode identifies the specific relation. relNode corresponds to - * pg_class.relfilenode (NOT pg_class.oid, because we need to be able - * to assign new physical files to relations in some situations). - * Notice that relNode is only unique within a database in a particular - * tablespace. + * relNode identifies the specific relation and its fork number. High 8 bits + * represent the fork number and the remaining 56 bits represent the + * relation. relNode corresponds to pg_class.relfilenode (NOT pg_class.oid). + * Notice that relNode is unique within a cluster. + * + * Note: When RelFileNode is part of the BufferTag only then the first 8 bits + * of the relNode will represent the fork number otherwise those will be + * cleared. * * Note: spcNode must be GLOBALTABLESPACE_OID if and only if dbNode is * zero. We support shared relations only in the "global" tablespace. @@ -53,12 +68,14 @@ * Note: various places use RelFileNode in hashtable keys. Therefore, * there *must not* be any unused padding bytes in this struct. That * should be safe as long as all the fields are of type Oid. + * + * We use RelNodeId in order to avoid the alignment padding. */ typedef struct RelFileNode { Oid spcNode; /* tablespace */ Oid dbNode; /* database */ - Oid relNode; /* relation */ + RelNodeId relNode; /* relation */ } RelFileNode; /* @@ -86,14 +103,51 @@ typedef struct RelFileNodeBackend * RelFileNodeBackendEquals. 
*/ #define RelFileNodeEquals(node1, node2) \ - ((node1).relNode == (node2).relNode && \ + ((RELFILENODE_GETRELNODE((node1)) == RELFILENODE_GETRELNODE((node2))) && \ (node1).dbNode == (node2).dbNode && \ (node1).spcNode == (node2).spcNode) #define RelFileNodeBackendEquals(node1, node2) \ - ((node1).node.relNode == (node2).node.relNode && \ + (RELFILENODE_GETRELNODE((node1).node) == RELFILENODE_GETRELNODE((node2).node) && \ (node1).node.dbNode == (node2).node.dbNode && \ (node1).backend == (node2).backend && \ (node1).node.spcNode == (node2).node.spcNode) +/* + * These macros define the "relation" stored in the RelFileNode.relNode. Its + * remaining 8 high-order bits identify the relation's fork number. + */ +#define RELFILENODE_RELNODE_BITS 56 +#define RELFILENODE_RELNODE_MASK ((((RelNode) 1) << RELFILENODE_RELNODE_BITS) - 1) +#define MAX_RELFILENODE RELFILENODE_RELNODE_MASK + +/* Retrieve the RelNode from a RelNodeId. */ +#define RELNODEID_GET_RELNODE(rnodeid) \ + ((RelNode) (((RelNode) (rnodeid).rn_hi << 32) | ((uint32) (rnodeid).rn_lo))) + +/* Store the given value in RelNodeId. */ +#define RELNODEID_SET_RELNODE(rnodeid, val) \ +( \ + (rnodeid).rn_hi = (val) >> 32, \ + (rnodeid).rn_lo = (val) & 0xffffffff \ +) + +/* Gets the relfilenode stored in rnode.relNode. */ +#define RELFILENODE_GETRELNODE(rnode) \ + (RELNODEID_GET_RELNODE((rnode).relNode) & RELFILENODE_RELNODE_MASK) + +/* Gets the fork number stored in rnode.relNode. */ +#define RELFILENODE_GETFORKNUM(rnode) \ + (RELNODEID_GET_RELNODE((rnode).relNode) >> RELFILENODE_RELNODE_BITS) + +/* Sets val as the relfilenode part of rnode.relNode; the fork number bits are reset to zero. */ +#define RELFILENODE_SETRELNODE(rnode, val) \ + RELNODEID_SET_RELNODE((rnode).relNode, (val) & RELFILENODE_RELNODE_MASK) + +/* Sets input val in the fork number part of the rnode.relNode. 
*/ +#define RELFILENODE_SETFORKNUM(rnode, val) \ + RELNODEID_SET_RELNODE((rnode).relNode, \ + (RELNODEID_GET_RELNODE((rnode).relNode) & RELFILENODE_RELNODE_MASK) | \ + ((RelNode) ((uint64) (val) << RELFILENODE_RELNODE_BITS))) + #endif /* RELFILENODE_H */ diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h index 9737e1e..4d67850 100644 --- a/src/include/storage/sync.h +++ b/src/include/storage/sync.h @@ -57,7 +57,6 @@ typedef struct FileTag extern void InitSync(void); extern void SyncPreCheckpoint(void); -extern void SyncPostCheckpoint(void); extern void ProcessSyncRequests(void); extern void RememberSyncRequest(const FileTag *ftag, SyncRequestType type); extern bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 6da1b22..d799b71 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -526,7 +526,7 @@ typedef struct ViewOptions */ #define RelationIsMapped(relation) \ (RELKIND_HAS_STORAGE((relation)->rd_rel->relkind) && \ - ((relation)->rd_rel->relfilenode == InvalidOid)) + ((relation)->rd_rel->relfilenode == InvalidRelfileNode)) /* * RelationGetSmgr diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index 84d6afe..5d13660 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -102,7 +102,7 @@ extern Relation RelationBuildLocalRelation(const char *relname, TupleDesc tupDesc, Oid relid, Oid accessmtd, - Oid relfilenode, + RelNode relfilenode, Oid reltablespace, bool shared_relation, bool mapped_relation, diff --git a/src/include/utils/relfilenodemap.h b/src/include/utils/relfilenodemap.h index 77d8046..d324981 100644 --- a/src/include/utils/relfilenodemap.h +++ b/src/include/utils/relfilenodemap.h @@ -13,6 +13,6 @@ #ifndef RELFILENODEMAP_H #define RELFILENODEMAP_H -extern Oid RelidByRelfilenode(Oid reltablespace, Oid relfilenode); +extern Oid RelidByRelfilenode(Oid reltablespace, RelNode relfilenode); #endif /* RELFILENODEMAP_H */ diff --git 
a/src/include/utils/relmapper.h b/src/include/utils/relmapper.h index 9fbb5a7..58234a8 100644 --- a/src/include/utils/relmapper.h +++ b/src/include/utils/relmapper.h @@ -35,11 +35,11 @@ typedef struct xl_relmap_update #define MinSizeOfRelmapUpdate offsetof(xl_relmap_update, data) -extern Oid RelationMapOidToFilenode(Oid relationId, bool shared); +extern RelNode RelationMapOidToFilenode(Oid relationId, bool shared); -extern Oid RelationMapFilenodeToOid(Oid relationId, bool shared); +extern Oid RelationMapFilenodeToOid(RelNode relationId, bool shared); -extern void RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared, +extern void RelationMapUpdateMap(Oid relationId, RelNode fileNode, bool shared, bool immediate); extern void RelationMapRemoveMapping(Oid relationId); diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 16e0475..58aeddb 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -2164,7 +2164,6 @@ select relname, c.oid = oldoid as orig_oid, case relfilenode when 0 then 'none' - when c.oid then 'own' when oldfilenode then 'orig' else 'OTHER' end as storage, @@ -2175,10 +2174,10 @@ select relname, relname | orig_oid | storage | desc ------------------------------+----------+---------+--------------- at_partitioned | t | none | - at_partitioned_0 | t | own | - at_partitioned_0_id_name_key | t | own | child 0 index - at_partitioned_1 | t | own | - at_partitioned_1_id_name_key | t | own | child 1 index + at_partitioned_0 | t | orig | + at_partitioned_0_id_name_key | t | orig | child 0 index + at_partitioned_1 | t | orig | + at_partitioned_1_id_name_key | t | orig | child 1 index at_partitioned_id_name_key | t | none | parent index (6 rows) @@ -2198,7 +2197,6 @@ select relname, c.oid = oldoid as orig_oid, case relfilenode when 0 then 'none' - when c.oid then 'own' when oldfilenode then 'orig' else 'OTHER' end as storage, @@ -2209,10 +2207,10 @@ select 
relname, relname | orig_oid | storage | desc ------------------------------+----------+---------+-------------- at_partitioned | t | none | - at_partitioned_0 | t | own | - at_partitioned_0_id_name_key | f | own | parent index - at_partitioned_1 | t | own | - at_partitioned_1_id_name_key | f | own | parent index + at_partitioned_0 | t | orig | + at_partitioned_0_id_name_key | f | OTHER | parent index + at_partitioned_1 | t | orig | + at_partitioned_1_id_name_key | f | OTHER | parent index at_partitioned_id_name_key | f | none | parent index (6 rows) @@ -2556,7 +2554,7 @@ CREATE FUNCTION check_ddl_rewrite(p_tablename regclass, p_ddl text) RETURNS boolean LANGUAGE plpgsql AS $$ DECLARE - v_relfilenode oid; + v_relfilenode int8; BEGIN v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index ac894c0..250e6cd 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -1478,7 +1478,6 @@ select relname, c.oid = oldoid as orig_oid, case relfilenode when 0 then 'none' - when c.oid then 'own' when oldfilenode then 'orig' else 'OTHER' end as storage, @@ -1499,7 +1498,6 @@ select relname, c.oid = oldoid as orig_oid, case relfilenode when 0 then 'none' - when c.oid then 'own' when oldfilenode then 'orig' else 'OTHER' end as storage, @@ -1638,7 +1636,7 @@ CREATE FUNCTION check_ddl_rewrite(p_tablename regclass, p_ddl text) RETURNS boolean LANGUAGE plpgsql AS $$ DECLARE - v_relfilenode oid; + v_relfilenode int8; BEGIN v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename; -- 1.8.3.1