From 4076702a39d5ab0c0cd5af860fb47d2b0742c7ee Mon Sep 17 00:00:00 2001 From: Maxim Orlov Date: Fri, 24 Oct 2025 10:58:37 +0300 Subject: [PATCH v20 2/5] Add pg_upgrade for 64 bit multixact offsets Author: Maxim Orlov Author: Heikki Linnakangas --- src/backend/access/transam/multixact.c | 35 +-- src/bin/pg_upgrade/Makefile | 3 + src/bin/pg_upgrade/meson.build | 3 + src/bin/pg_upgrade/multixact_new.c | 227 +++++++++++++++++++ src/bin/pg_upgrade/multixact_new.h | 31 +++ src/bin/pg_upgrade/multixact_old.c | 296 +++++++++++++++++++++++++ src/bin/pg_upgrade/multixact_old.h | 31 +++ src/bin/pg_upgrade/pg_upgrade.c | 108 ++++++++- src/bin/pg_upgrade/pg_upgrade.h | 5 + src/bin/pg_upgrade/slru_io.c | 240 ++++++++++++++++++++ src/bin/pg_upgrade/slru_io.h | 30 +++ 11 files changed, 977 insertions(+), 32 deletions(-) create mode 100644 src/bin/pg_upgrade/multixact_new.c create mode 100644 src/bin/pg_upgrade/multixact_new.h create mode 100644 src/bin/pg_upgrade/multixact_old.c create mode 100644 src/bin/pg_upgrade/multixact_old.h create mode 100644 src/bin/pg_upgrade/slru_io.c create mode 100644 src/bin/pg_upgrade/slru_io.h diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 93a1e4cfd2a..5a13596cd86 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -1231,7 +1231,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, int slotno; MultiXactOffset offset; int length; - int truelength; MultiXactId oldestMXact; MultiXactId nextMXact; MultiXactId tmpMXact; @@ -1330,15 +1329,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * we have just for this; the process in charge will signal the CV as soon * as it has finished writing the multixact offset. * - * 3. Because GetNewMultiXactId increments offset zero to offset one to - * handle case #2, there is an ambiguity near the point of offset - * wraparound.
If we see next multixact's offset is one, is that our - * multixact's actual endpoint, or did it end at zero with a subsequent - * increment? We handle this using the knowledge that if the zero'th - * member slot wasn't filled, it'll contain zero, and zero isn't a valid - * transaction ID so it can't be a multixact member. Therefore, if we - * read a zero from the members array, just ignore it. - * * This is all pretty messy, but the mess occurs only in infrequent corner * cases, so it seems better than holding the MultiXactGenLock for a long * time on every multixact creation. @@ -1422,6 +1412,9 @@ retry: LWLockRelease(lock); lock = NULL; + /* A multixid with zero members should not happen */ + Assert(length > 0); + /* * If we slept above, clean up state; it's no longer needed. */ @@ -1430,7 +1423,6 @@ retry: ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); - truelength = 0; prev_pageno = -1; for (int i = 0; i < length; i++, offset++) { @@ -1468,36 +1460,27 @@ retry: xactptr = (TransactionId *) (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); - if (!TransactionIdIsValid(*xactptr)) - { - /* Corner case 3: we must be looking at unused slot zero */ - Assert(offset == 0); - continue; - } + Assert(TransactionIdIsValid(*xactptr)); flagsoff = MXOffsetToFlagsOffset(offset); bshift = MXOffsetToFlagsBitShift(offset); flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); - ptr[truelength].xid = *xactptr; - ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; - truelength++; + ptr[i].xid = *xactptr; + ptr[i].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; } LWLockRelease(lock); - /* A multixid with zero members should not happen */ - Assert(truelength > 0); - /* * Copy the result into the local cache. 
 */ - mXactCachePut(multi, truelength, ptr); + mXactCachePut(multi, length, ptr); debug_elog3(DEBUG2, "GetMembers: no cache for %s", - mxid_to_string(multi, truelength, ptr)); + mxid_to_string(multi, length, ptr)); *members = ptr; - return truelength; + return length; } /* diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile index 69fcf593cae..42995d53b0b 100644 --- a/src/bin/pg_upgrade/Makefile +++ b/src/bin/pg_upgrade/Makefile @@ -18,11 +18,14 @@ OBJS = \ file.o \ function.o \ info.o \ + multixact_new.o \ + multixact_old.o \ option.o \ parallel.o \ pg_upgrade.o \ relfilenumber.o \ server.o \ + slru_io.o \ tablespace.o \ task.o \ util.o \ diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build index ac992f0d14b..3e46c4512cf 100644 --- a/src/bin/pg_upgrade/meson.build +++ b/src/bin/pg_upgrade/meson.build @@ -8,11 +8,14 @@ pg_upgrade_sources = files( 'file.c', 'function.c', 'info.c', + 'multixact_new.c', + 'multixact_old.c', 'option.c', 'parallel.c', 'pg_upgrade.c', 'relfilenumber.c', 'server.c', + 'slru_io.c', 'tablespace.c', 'task.c', 'util.c', diff --git a/src/bin/pg_upgrade/multixact_new.c b/src/bin/pg_upgrade/multixact_new.c new file mode 100644 index 00000000000..d43442fb9a7 --- /dev/null +++ b/src/bin/pg_upgrade/multixact_new.c @@ -0,0 +1,227 @@ +/* + * multixact_new.c + * + * Rewrite pre-v19 multixacts to new format with 64-bit MultiXactOffsets + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/multixact_new.c + */ + +#include "multixact_new.h" + +/* + * NOTE: Below are a bunch of definitions and simple inline functions that are + * copy-pasted from multixact.c + */ + +/* We need eight bytes per offset */ +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) + +static inline int64 +MultiXactIdToOffsetPage(MultiXactId multi) +{ + return multi / MULTIXACT_OFFSETS_PER_PAGE; +} + +static inline int +MultiXactIdToOffsetEntry(MultiXactId multi) +{ +
return multi % MULTIXACT_OFFSETS_PER_PAGE; +} + +/* We need eight bits per xact, so one xact fits in a byte */ +#define MXACT_MEMBER_BITS_PER_XACT 8 +#define MXACT_MEMBER_FLAGS_PER_BYTE 1 +#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) + +/* how many full bytes of flags are there in a group? */ +#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERS_PER_PAGE \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) + +/* + * Because the number of items per page is not a divisor of the last item + * number (member 0xFFFFFFFF), the last segment does not use the maximum number + * of pages, and moreover the last used page therein does not use the same + * number of items as previous pages. (Another way to say it is that the + * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page + * has some empty space after that item.) + * + * This constant is the number of members in the last page of the last segment. 
+ */ +#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \ + ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1)) + +/* page in which a member is to be found */ +static inline int64 +MXOffsetToMemberPage(MultiXactOffset offset) +{ + return offset / MULTIXACT_MEMBERS_PER_PAGE; +} + +/* Location (byte offset within page) of flag word for a given member */ +static inline int +MXOffsetToFlagsOffset(MultiXactOffset offset) +{ + MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE; + int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE; + + return byteoff; +} + +static inline int +MXOffsetToFlagsBitShift(MultiXactOffset offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT; + + return bshift; +} + +/* Location (byte offset within page) of TransactionId of given member */ +static inline int +MXOffsetToMemberOffset(MultiXactOffset offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + + return MXOffsetToFlagsOffset(offset) + + MULTIXACT_FLAGBYTES_PER_GROUP + + member_in_group * sizeof(TransactionId); +} + +static inline void +MXOffsetWrite(char *buf, int entryno, MultiXactOffset offset) +{ + MultiXactOffset *offptr = (MultiXactOffset *) buf; + + offptr[entryno] = offset; +} + +MultiXactWriter * +AllocMultiXactWrite(char *pgdata, MultiXactId firstMulti, + MultiXactOffset firstOffset) +{ + MultiXactWriter *state = pg_malloc(sizeof(*state)); + char dir[MAXPGPATH] = {0}; + + state->nextMXact = firstMulti; + state->nextOffset = firstOffset; + + pg_sprintf(dir, "%s/pg_multixact/offsets", pgdata); + state->offset = AllocSlruWrite(dir, false); + + pg_sprintf(dir, "%s/pg_multixact/members", pgdata); + state->members = AllocSlruWrite(dir, true /* use long segment names */); + + return state; +} + +/* + * Simplified copy of the corresponding server function + */ +MultiXactId 
+GetNewMultiXactId(MultiXactWriter *state, int nmembers, MultiXactOffset *offset) +{ + MultiXactId result; + + /* Handle wraparound of the nextMXact counter */ + if (state->nextMXact < FirstMultiXactId) + state->nextMXact = FirstMultiXactId; + + /* Assign the MXID */ + result = state->nextMXact; + + /* Reserve the members space, similarly to above. */ + *offset = state->nextOffset; + + /* + * Advance counters. As in GetNewTransactionId(), this must not happen + * until after file extension has succeeded! + * + * We don't care about MultiXactId wraparound here; it will be handled by + * the next iteration. But note that nextMXact may be InvalidMultiXactId + * or the first value on a segment-beginning page after this routine + * exits, so anyone else looking at the variable must be prepared to deal + * with either case. Similarly, nextOffset may be zero, but we won't use + * that as the actual start offset of the next multixact. + */ + (state->nextMXact)++; + + state->nextOffset += nmembers; + + return result; +} + +/* + * Write a new multixact with members. + * + * Simplified version of the corresponding server function, hence the name.
+ */ +void +RecordNewMultiXact(MultiXactWriter *state, MultiXactOffset offset, + MultiXactId multi, int nmembers, MultiXactMember *members) +{ + int64 pageno; + int64 prev_pageno; + int entryno, + i; + char *buf; + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + buf = SlruWriteSwitchPage(state->offset, pageno); + MXOffsetWrite(buf, entryno, offset); + + prev_pageno = -1; + + for (i = 0; i < nmembers; i++, offset++) + { + TransactionId *memberptr; + uint32 *flagsptr; + uint32 flagsval; + int bshift; + int flagsoff; + int memberoff; + + Assert(members[i].status <= MultiXactStatusUpdate); + + pageno = MXOffsetToMemberPage(offset); + memberoff = MXOffsetToMemberOffset(offset); + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + + if (pageno != prev_pageno) + { + buf = SlruWriteSwitchPage(state->members, pageno); + prev_pageno = pageno; + } + + memberptr = (TransactionId *) (buf + memberoff); + + *memberptr = members[i].xid; + + flagsptr = (uint32 *) (buf + flagsoff); + + flagsval = *flagsptr; + flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= (members[i].status << bshift); + *flagsptr = flagsval; + } +} + +void +FreeMultiXactWrite(MultiXactWriter *state) +{ + FreeSlruWrite(state->offset); + FreeSlruWrite(state->members); + + pfree(state); +} diff --git a/src/bin/pg_upgrade/multixact_new.h b/src/bin/pg_upgrade/multixact_new.h new file mode 100644 index 00000000000..33d5d1b8222 --- /dev/null +++ b/src/bin/pg_upgrade/multixact_new.h @@ -0,0 +1,31 @@ +/* + * multixact_new.h + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/multixact_new.h + */ + +#include "postgres_fe.h" + +#include "access/multixact.h" + +#include "slru_io.h" + +typedef struct MultiXactWriter +{ + MultiXactId nextMXact; + MultiXactOffset nextOffset; + + SlruSegState *offset; + SlruSegState *members; +} MultiXactWriter; + +extern MultiXactWriter 
 *AllocMultiXactWrite(char *pgdata, + MultiXactId firstMulti, + MultiXactOffset firstOffset); +extern MultiXactId GetNewMultiXactId(MultiXactWriter *state, int nmembers, + MultiXactOffset *offset); +extern void RecordNewMultiXact(MultiXactWriter *state, MultiXactOffset offset, + MultiXactId multi, int nmembers, + MultiXactMember *members); +extern void FreeMultiXactWrite(MultiXactWriter *writer); diff --git a/src/bin/pg_upgrade/multixact_old.c b/src/bin/pg_upgrade/multixact_old.c new file mode 100644 index 00000000000..6cc384d2cf2 --- /dev/null +++ b/src/bin/pg_upgrade/multixact_old.c @@ -0,0 +1,296 @@ +/* + * multixact_old.c + * + * Rewrite pre-v19 multixacts to new format with 64-bit MultiXactOffsets + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/multixact_old.c + */ + +#include "multixact_old.h" + +#include "pg_upgrade.h" + +/* + * NOTE: below are a bunch of definitions and simple static inline functions + * that are copy-pasted from multixact.c from version 18. The only difference + * is that we use the OldMultiXactOffset type equal to uint32 instead of + * MultiXactOffset which became uint64. + */ + +/* We need four bytes per offset */ +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(OldMultiXactOffset)) + +static inline int64 +MultiXactIdToOffsetPage(MultiXactId multi) +{ + return multi / MULTIXACT_OFFSETS_PER_PAGE; +} + +static inline int +MultiXactIdToOffsetEntry(MultiXactId multi) +{ + return multi % MULTIXACT_OFFSETS_PER_PAGE; +} + +/* + * The situation for members is a bit more complex: we store one byte of + * additional flag bits for each TransactionId. To do this without getting + * into alignment issues, we store four bytes of flags, and then the + * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and + * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups + * per page.
This wastes 12 bytes per page, but that's OK -- simplicity (and + * performance) trumps space efficiency here. + * + * Note that the "offset" macros work with byte offset, not array indexes, so + * arithmetic must be done using "char *" pointers. + */ +/* We need eight bits per xact, so one xact fits in a byte */ +#define MXACT_MEMBER_BITS_PER_XACT 8 +#define MXACT_MEMBER_FLAGS_PER_BYTE 1 +#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) + +/* how many full bytes of flags are there in a group? */ +#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERS_PER_PAGE \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) + +/* page in which a member is to be found */ +static inline int64 +MXOffsetToMemberPage(OldMultiXactOffset offset) +{ + return offset / MULTIXACT_MEMBERS_PER_PAGE; +} + +/* Location (byte offset within page) of flag word for a given member */ +static inline int +MXOffsetToFlagsOffset(MultiXactOffset offset) +{ + OldMultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE; + int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE; + + return byteoff; +} + +/* Location (byte offset within page) of TransactionId of given member */ +static inline int +MXOffsetToMemberOffset(OldMultiXactOffset offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + + return MXOffsetToFlagsOffset(offset) + + MULTIXACT_FLAGBYTES_PER_GROUP + + member_in_group * sizeof(TransactionId); +} + +static inline int +MXOffsetToFlagsBitShift(OldMultiXactOffset offset) +{ + int 
 member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT; + + return bshift; +} + +/* + * Construct reader of old multixacts. + * + * Returns the malloced memory used by all the other calls in this module. + */ +OldMultiXactReader * +AllocOldMultiXactRead(char *pgdata, MultiXactId nextMulti, + OldMultiXactOffset nextOffset) +{ + OldMultiXactReader *state = state = pg_malloc(sizeof(*state)); + char dir[MAXPGPATH] = {0}; + + state->nextMXact = nextMulti; + state->nextOffset = nextOffset; + + pg_sprintf(dir, "%s/pg_multixact/offsets", pgdata); + state->offset = AllocSlruRead(dir); + + pg_sprintf(dir, "%s/pg_multixact/members", pgdata); + state->members = AllocSlruRead(dir); + + return state; +} + +/* + * This is a simplified version of the GetMultiXactIdMembers() server function. + * + * - Only return the updating member, if any. Upgrade only cares about the + * updaters. If there is no updating member, return the first locking-only + * member. We don't have any way to represent "no members", but we also don't + * need to preserve all the locking members. + * + * - We don't need to worry about locking and some corner cases because there's + * no concurrent activity. + */ +void +GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi, + TransactionId *result, MultiXactStatus *status) +{ + MultiXactId nextMXact, + nextOffset, + tmpMXact; + int64 pageno, + prev_pageno; + int entryno, + length; + char *buf; + OldMultiXactOffset *offptr, + offset; + TransactionId result_xid = InvalidTransactionId; + bool result_isupdate = false; + + nextMXact = state->nextMXact; + nextOffset = state->nextOffset; + + /* + * See GetMultiXactIdMembers in multixact.c + * + * Find out the offset at which we need to start reading MultiXactMembers + * and the number of members in the multixact. We determine the latter as + * the difference between this multixact's starting offset and the next + * one's.
However, there are some corner cases to worry about: + * + * 1. This multixact may be the latest one created, in which case there is + * no next one to look at. In this case the nextOffset value we just + * saved is the correct endpoint. + * + * 2. The next multixact may still be in process of being filled in... + * This cannot happen during upgrade. + * + * 3. Because GetNewMultiXactId increments offset zero to offset one to + * handle case #2, there is an ambiguity near the point of offset + * wraparound. If we see next multixact's offset is one, is that our + * multixact's actual endpoint, or did it end at zero with a subsequent + * increment? We handle this using the knowledge that if the zero'th + * member slot wasn't filled, it'll contain zero, and zero isn't a valid + * transaction ID so it can't be a multixact member. Therefore, if we + * read a zero from the members array, just ignore it. + */ + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + buf = SlruReadSwitchPage(state->offset, pageno); + offptr = (OldMultiXactOffset *) buf; + offptr += entryno; + offset = *offptr; + + Assert(offset != 0); + + /* + * Use the same increment rule as GetNewMultiXactId(), that is, don't + * handle wraparound explicitly until needed. + */ + tmpMXact = multi + 1; + + if (nextMXact == tmpMXact) + { + /* Corner case 1: there is no next multixact */ + length = nextOffset - offset; + } + else + { + OldMultiXactOffset nextMXOffset; + + /* handle wraparound if needed */ + if (tmpMXact < FirstMultiXactId) + tmpMXact = FirstMultiXactId; + + prev_pageno = pageno; + + pageno = MultiXactIdToOffsetPage(tmpMXact); + entryno = MultiXactIdToOffsetEntry(tmpMXact); + + if (pageno != prev_pageno) + buf = SlruReadSwitchPage(state->offset, pageno); + + offptr = (OldMultiXactOffset *) buf; + offptr += entryno; + nextMXOffset = *offptr; + + /* + * Corner case 2: next multixact is still being filled in, this must + * not happen during upgrade. 
+ */ + Assert(nextMXOffset != 0); + + length = nextMXOffset - offset; + } + + prev_pageno = -1; + for (int i = 0; i < length; i++, offset++) + { + TransactionId *xactptr; + uint32 *flagsptr; + int flagsoff; + int bshift; + int memberoff; + MultiXactStatus st; + + pageno = MXOffsetToMemberPage(offset); + memberoff = MXOffsetToMemberOffset(offset); + + if (pageno != prev_pageno) + { + buf = SlruReadSwitchPage(state->members, pageno); + prev_pageno = pageno; + } + + xactptr = (TransactionId *) (buf + memberoff); + if (!TransactionIdIsValid(*xactptr)) + { + /* Corner case 3: we must be looking at unused slot zero */ + Assert(offset == 0); + continue; + } + + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + flagsptr = (uint32 *) (buf + flagsoff); + + st = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; + + /* Verify that there is a single update Xid among the given members. */ + if (ISUPDATE_from_mxstatus(st)) + { + if (result_isupdate) + pg_fatal("multixact %u has more than one updating member", + multi); + result_xid = *xactptr; + result_isupdate = true; + } + else if (!TransactionIdIsValid(result_xid)) + result_xid = *xactptr; + } + + /* A multixid with zero members should not happen */ + Assert(TransactionIdIsValid(result_xid)); + + *result = result_xid; + *status = result_isupdate ? MultiXactStatusUpdate : + MultiXactStatusForKeyShare; +} + +/* + * Frees the malloced reader. 
+ */ +void +FreeOldMultiXactReader(OldMultiXactReader *state) +{ + FreeSlruRead(state->offset); + FreeSlruRead(state->members); + + pfree(state); +} diff --git a/src/bin/pg_upgrade/multixact_old.h b/src/bin/pg_upgrade/multixact_old.h new file mode 100644 index 00000000000..8d4659ba6a0 --- /dev/null +++ b/src/bin/pg_upgrade/multixact_old.h @@ -0,0 +1,31 @@ +/* + * multixact_old.h + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/multixact_old.h + */ + +#include "postgres_fe.h" + +#include "access/multixact.h" +#include "slru_io.h" + +typedef uint32 OldMultiXactOffset; + +typedef struct OldMultiXactReader +{ + MultiXactId nextMXact; + OldMultiXactOffset nextOffset; + + SlruSegState *offset; + SlruSegState *members; +} OldMultiXactReader; + +extern OldMultiXactReader *AllocOldMultiXactRead(char *pgdata, + MultiXactId nextMulti, + OldMultiXactOffset nextOffset); +extern void GetOldMultiXactIdSingleMember(OldMultiXactReader *state, + MultiXactId multi, + TransactionId *result, + MultiXactStatus *status); +extern void FreeOldMultiXactReader(OldMultiXactReader *reader); diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 490e98fa26f..5432c03a2b0 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -49,6 +49,8 @@ #include "common/restricted_token.h" #include "fe_utils/string_utils.h" #include "pg_upgrade.h" +#include "multixact_old.h" +#include "multixact_new.h" /* * Maximum number of pg_restore actions (TOC entries) to process within one @@ -769,6 +771,82 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir) check_ok(); } +/* + * Convert pg_multixact/offset and /members to new format with 64-bit offsets. 
+ */ +static void +convert_multixacts(MultiXactId *new_nxtmulti, MultiXactOffset *new_nxtmxoff) +{ + MultiXactId oldest_multi, + next_multi; + OldMultiXactReader *old_reader; + MultiXactWriter *new_writer; + + old_reader = AllocOldMultiXactRead(old_cluster.pgdata, + old_cluster.controldata.chkpnt_nxtmulti, + old_cluster.controldata.chkpnt_nxtmxoff); + new_writer = AllocMultiXactWrite(new_cluster.pgdata, + old_cluster.controldata.chkpnt_oldstMulti, + 1 /* see below */); + + oldest_multi = old_cluster.controldata.chkpnt_oldstMulti; + next_multi = old_cluster.controldata.chkpnt_nxtmulti; + + /* handle wraparound */ + if (next_multi < FirstMultiXactId) + next_multi = FirstMultiXactId; + + /* + * Read multixids from old files one by one, and write them back in the new + * format. + * + * The locking-only XIDs that may be part of multi-xids don't matter after + * upgrade, as there can be no transactions running across upgrade. So as + * a little optimization, we only read one member from each multixid: the + * one updating one, or if there was no update, arbitrarily the first + * locking xid. + */ + for (MultiXactId multi = oldest_multi; multi != next_multi;) + { + TransactionId xid; + MultiXactStatus status; + MultiXactMember member; + MultiXactId new_multi PG_USED_FOR_ASSERTS_ONLY; + MultiXactOffset offset; + + /* Read the old multixid */ + GetOldMultiXactIdSingleMember(old_reader, multi, &xid, &status); + + /* Write it out in new format */ + member.xid = xid; + member.status = status; + new_multi = GetNewMultiXactId(new_writer, 1, &offset); + + Assert(new_multi == multi); + + RecordNewMultiXact(new_writer, offset, multi, 1, &member); + + multi++; + /* handle wraparound */ + if (multi < FirstMultiXactId) + multi = FirstMultiXactId; + } + + /* + * Update the nextMXact/Offset values in the control file to match what we + * wrote. The nextMXact should be unchanged, but because we ignored the + * locking XIDs members, the nextOffset will be different. 
+ */ + Assert(new_writer->nextMXact == next_multi); + + *new_nxtmulti = next_multi; + *new_nxtmxoff = new_writer->nextOffset; + + /* Release resources */ + FreeMultiXactWrite(new_writer); + FreeOldMultiXactReader(old_reader); +} + static void copy_xact_xlog_xid(void) { @@ -816,8 +894,28 @@ copy_xact_xlog_xid(void) if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER && new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) { - copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets"); - copy_subdir_files("pg_multixact/members", "pg_multixact/members"); + MultiXactId new_nxtmulti = old_cluster.controldata.chkpnt_nxtmulti; + MultiXactOffset new_nxtmxoff = old_cluster.controldata.chkpnt_nxtmxoff; + + /* + * If the old server is before the MULTIXACTOFFSET_FORMATCHANGE_CAT_VER + * it must have 32-bit multixid offsets, thus it should be converted. + */ + if (old_cluster.controldata.cat_ver < MULTIXACTOFFSET_FORMATCHANGE_CAT_VER && + new_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER) + { + remove_new_subdir("pg_multixact/members", false); + remove_new_subdir("pg_multixact/offsets", false); + + prep_status("Converting pg_multixact/offsets to 64-bit"); + convert_multixacts(&new_nxtmulti, &new_nxtmxoff); + check_ok(); + } + else + { + copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets"); + copy_subdir_files("pg_multixact/members", "pg_multixact/members"); + } prep_status("Setting next multixact ID and offset for new cluster"); @@ -826,10 +924,8 @@ copy_xact_xlog_xid(void) * counters here and the oldest multi present on system. 
 */ exec_prog(UTILITY_LOG_FILE, NULL, true, true, - "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"", - new_cluster.bindir, - old_cluster.controldata.chkpnt_nxtmxoff, - old_cluster.controldata.chkpnt_nxtmulti, + "\"%s/pg_resetwal\" -O %" PRIu64 " -m %u,%u \"%s\"", + new_cluster.bindir, new_nxtmxoff, new_nxtmulti, old_cluster.controldata.chkpnt_oldstMulti, new_cluster.pgdata); check_ok(); diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index e86336f4be9..127b2cb00fa 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -114,6 +114,11 @@ extern char *output_files[]; */ #define MULTIXACT_FORMATCHANGE_CAT_VER 201301231 +/* + * Switch from 32-bit to 64-bit multixid offsets. + */ +#define MULTIXACTOFFSET_FORMATCHANGE_CAT_VER 999999999 + /* * large object chunk size added to pg_controldata, * commit 5f93c37805e7485488480916b4585e098d3cc883 diff --git a/src/bin/pg_upgrade/slru_io.c b/src/bin/pg_upgrade/slru_io.c new file mode 100644 index 00000000000..4e823199303 --- /dev/null +++ b/src/bin/pg_upgrade/slru_io.c @@ -0,0 +1,240 @@ +/* + * slru_io.c + * + * Routines for reading and writing SLRU files during upgrade. + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/slru_io.c + */ + +#include "postgres_fe.h" + +#include + +#include "pg_upgrade.h" +#include "slru_io.h" + +#include "common/fe_memutils.h" +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "port/pg_iovec.h" + +/* + * State for reading or writing an SLRU, with a one page buffer.
+ */ +typedef struct SlruSegState +{ + bool writing; + bool long_segment_names; + + char *dir; + char *fn; + int fd; + int64 segno; + uint64 pageno; + + PGAlignedBlock buf; +} SlruSegState; + +static inline SlruSegState * +AllocSlruSegState(char *dir) +{ + SlruSegState *state = pg_malloc(sizeof(*state)); + + state->segno = -1; + state->pageno = 0; + state->dir = pstrdup(dir); + state->fd = -1; + state->fn = NULL; + + return state; +} + +static inline void +SlruFlush(SlruSegState *state) +{ + struct iovec iovec = { + .iov_base = &state->buf, + .iov_len = BLCKSZ, + }; + off_t offset; + + if (state->segno == -1) + return; + + offset = (state->pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ; + + if (pg_pwritev_with_retry(state->fd, &iovec, 1, offset) < 0) + pg_fatal("could not write file \"%s\": %m", state->fn); +} + +/* + * Create slru reader for dir. + * + * Returns the malloced memory used by all the other read calls in this module. + */ +SlruSegState * +AllocSlruRead(char *dir) +{ + SlruSegState *state = AllocSlruSegState(dir); + + state->writing = false; + + return state; +} + +/* + * Open given page for reading. + * + * Reading can be done in random order.
+ */ +char * +SlruReadSwitchPage(SlruSegState *state, uint64 pageno) +{ + int64 segno; + + Assert(!state->writing); /* read only mode */ + + if (state->segno != -1 && pageno == state->pageno) + return state->buf.data; + + segno = pageno / SLRU_PAGES_PER_SEGMENT; + if (segno != state->segno) + { + if (state->segno != -1) + { + close(state->fd); + state->fd = -1; + + pg_free(state->fn); + state->fn = NULL; + } + + /* Open new segment */ + state->fn = psprintf("%s/%04X", state->dir, (unsigned int) segno); + if ((state->fd = open(state->fn, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("could not open file \"%s\": %m", state->fn); + } + + state->segno = segno; + + { + struct iovec iovec = { + .iov_base = &state->buf, + .iov_len = BLCKSZ, + }; + off_t offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ; + + if (pg_preadv(state->fd, &iovec, 1, offset) < 0) + pg_fatal("could not read file \"%s\": %m", state->fn); + + state->pageno = pageno; + } + + return state->buf.data; +} + +/* + * Frees the malloced reader. + */ +void +FreeSlruRead(SlruSegState *state) +{ + Assert(!state->writing); /* read only mode */ + + close(state->fd); + pg_free(state); +} + +/* + * Open the given page for writing. + * + * NOTE: This uses O_EXCL when stepping to a new segment, so this assumes that + * each segment is written in full before moving on to next one. This + * limitation would be easy to lift if needed, but it fits the usage pattern of + * current callers. 
+ */ +char * +SlruWriteSwitchPage(SlruSegState *state, uint64 pageno) +{ + int64 segno = pageno / SLRU_PAGES_PER_SEGMENT; + off_t offset; + + if (state->segno != -1 && pageno == state->pageno) + return state->buf.data; + + segno = pageno / SLRU_PAGES_PER_SEGMENT; + offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ; + + SlruFlush(state); + memset(state->buf.data, 0, BLCKSZ); + + if (segno != state->segno) + { + if (state->segno != -1) + { + close(state->fd); + state->fd = -1; + + pg_free(state->fn); + state->fn = NULL; + } + + /* Create the segment */ + if (state->long_segment_names) + { + Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF)); + state->fn = psprintf("%s/%015" PRIX64, state->dir, segno); + } + else + { + Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF)); + state->fn = psprintf("%s/%04X", state->dir, (unsigned int) segno); + } + + if ((state->fd = open(state->fn, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + { + pg_fatal("could not create file \"%s\": %m", state->fn); + } + + state->segno = segno; + + if (offset > 0 && pg_pwrite_zeros(state->fd, offset, 0) < 0) + pg_fatal("could not write file \"%s\": %m", state->fn); + } + + state->pageno = pageno; + + return state->buf.data; +} + +/* + * Create slru writer for dir. + * + * Returns the malloced memory used by all the other write calls in this module. + */ +SlruSegState * +AllocSlruWrite(char *dir, bool long_segment_names) +{ + SlruSegState *state = AllocSlruSegState(dir); + + state->writing = true; + state->long_segment_names = long_segment_names; + + return state; +} + +/* + * Frees the malloced writer.
+ */ +void +FreeSlruWrite(SlruSegState *state) +{ + Assert(state->writing); + + SlruFlush(state); + + close(state->fd); + pg_free(state); +} diff --git a/src/bin/pg_upgrade/slru_io.h b/src/bin/pg_upgrade/slru_io.h new file mode 100644 index 00000000000..920b8ae82e2 --- /dev/null +++ b/src/bin/pg_upgrade/slru_io.h @@ -0,0 +1,30 @@ +/* + * slru_io.h + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/slru_io.h + */ + +/* + * Some kind of iterator associated with a particular SLRU segment. The idea is + * to specify the segment and page number and then move through the pages. + */ + +#include "postgres_fe.h" + +/* + * See access/slru.h + * + * Copy here, since slru.h could not be included in fe code. + */ +#define SLRU_PAGES_PER_SEGMENT 32 + +typedef struct SlruSegState SlruSegState; + +extern SlruSegState *AllocSlruRead(char *dir); +extern char *SlruReadSwitchPage(SlruSegState *state, uint64 pageno); +extern void FreeSlruRead(SlruSegState *state); + +extern SlruSegState *AllocSlruWrite(char *dir, bool long_segment_names); +extern char *SlruWriteSwitchPage(SlruSegState *state, uint64 pageno); +extern void FreeSlruWrite(SlruSegState *state); -- 2.43.0