v21-0002-Add-pg_upgarde-for-64-bit-multixact-offsets.patch
application/octet-stream
Filename: v21-0002-Add-pg_upgarde-for-64-bit-multixact-offsets.patch
Type: application/octet-stream
Part: 1
Message:
Re: POC: make mxidoff 64 bits
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch v21-0002
Subject: Add pg_upgrade for 64 bit multixact offsets
| File | + | − |
|---|---|---|
| src/backend/access/transam/multixact.c | 9 | 26 |
| src/bin/pg_upgrade/Makefile | 3 | 0 |
| src/bin/pg_upgrade/meson.build | 3 | 0 |
| src/bin/pg_upgrade/multixact_new.c | 253 | 0 |
| src/bin/pg_upgrade/multixact_new.h | 31 | 0 |
| src/bin/pg_upgrade/multixact_old.c | 296 | 0 |
| src/bin/pg_upgrade/multixact_old.h | 31 | 0 |
| src/bin/pg_upgrade/pg_upgrade.c | 102 | 6 |
| src/bin/pg_upgrade/pg_upgrade.h | 5 | 0 |
| src/bin/pg_upgrade/slru_io.c | 240 | 0 |
| src/bin/pg_upgrade/slru_io.h | 30 | 0 |
From 8f5e88b2041e062a59ceaf692880821d2316dd0f Mon Sep 17 00:00:00 2001
From: Maxim Orlov <orlovmg@gmail.com>
Date: Fri, 24 Oct 2025 10:58:37 +0300
Subject: [PATCH v21 2/3] Add pg_upgrade for 64 bit multixact offsets
Author: Maxim Orlov <orlovmg@gmail.com>
Author: Heikki Linnakangas <hlinnaka@iki.fi>
---
src/backend/access/transam/multixact.c | 35 +--
src/bin/pg_upgrade/Makefile | 3 +
src/bin/pg_upgrade/meson.build | 3 +
src/bin/pg_upgrade/multixact_new.c | 253 +++++++++++++++++++++
src/bin/pg_upgrade/multixact_new.h | 31 +++
src/bin/pg_upgrade/multixact_old.c | 296 +++++++++++++++++++++++++
src/bin/pg_upgrade/multixact_old.h | 31 +++
src/bin/pg_upgrade/pg_upgrade.c | 108 ++++++++-
src/bin/pg_upgrade/pg_upgrade.h | 5 +
src/bin/pg_upgrade/slru_io.c | 240 ++++++++++++++++++++
src/bin/pg_upgrade/slru_io.h | 30 +++
11 files changed, 1003 insertions(+), 32 deletions(-)
create mode 100644 src/bin/pg_upgrade/multixact_new.c
create mode 100644 src/bin/pg_upgrade/multixact_new.h
create mode 100644 src/bin/pg_upgrade/multixact_old.c
create mode 100644 src/bin/pg_upgrade/multixact_old.h
create mode 100644 src/bin/pg_upgrade/slru_io.c
create mode 100644 src/bin/pg_upgrade/slru_io.h
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 1f59587c42e..6a865ba2059 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1353,7 +1353,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
int slotno;
MultiXactOffset offset;
int length;
- int truelength;
MultiXactId oldestMXact;
MultiXactId nextMXact;
MultiXactId tmpMXact;
@@ -1452,15 +1451,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
* we have just for this; the process in charge will signal the CV as soon
* as it has finished writing the multixact offset.
*
- * 3. Because GetNewMultiXactId increments offset zero to offset one to
- * handle case #2, there is an ambiguity near the point of offset
- * wraparound. If we see next multixact's offset is one, is that our
- * multixact's actual endpoint, or did it end at zero with a subsequent
- * increment? We handle this using the knowledge that if the zero'th
- * member slot wasn't filled, it'll contain zero, and zero isn't a valid
- * transaction ID so it can't be a multixact member. Therefore, if we
- * read a zero from the members array, just ignore it.
- *
* This is all pretty messy, but the mess occurs only in infrequent corner
* cases, so it seems better than holding the MultiXactGenLock for a long
* time on every multixact creation.
@@ -1544,6 +1534,9 @@ retry:
LWLockRelease(lock);
lock = NULL;
+ /* A multixid with zero members should not happen */
+ Assert(length > 0);
+
/*
* If we slept above, clean up state; it's no longer needed.
*/
@@ -1552,7 +1545,6 @@ retry:
ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
- truelength = 0;
prev_pageno = -1;
for (int i = 0; i < length; i++, offset++)
{
@@ -1590,36 +1582,27 @@ retry:
xactptr = (TransactionId *)
(MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
- if (!TransactionIdIsValid(*xactptr))
- {
- /* Corner case 3: we must be looking at unused slot zero */
- Assert(offset == 0);
- continue;
- }
+ Assert(TransactionIdIsValid(*xactptr));
flagsoff = MXOffsetToFlagsOffset(offset);
bshift = MXOffsetToFlagsBitShift(offset);
flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
- ptr[truelength].xid = *xactptr;
- ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
- truelength++;
+ ptr[i].xid = *xactptr;
+ ptr[i].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
}
LWLockRelease(lock);
- /* A multixid with zero members should not happen */
- Assert(truelength > 0);
-
/*
* Copy the result into the local cache.
*/
- mXactCachePut(multi, truelength, ptr);
+ mXactCachePut(multi, length, ptr);
debug_elog3(DEBUG2, "GetMembers: no cache for %s",
- mxid_to_string(multi, truelength, ptr));
+ mxid_to_string(multi, length, ptr));
*members = ptr;
- return truelength;
+ return length;
}
/*
diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile
index 69fcf593cae..42995d53b0b 100644
--- a/src/bin/pg_upgrade/Makefile
+++ b/src/bin/pg_upgrade/Makefile
@@ -18,11 +18,14 @@ OBJS = \
file.o \
function.o \
info.o \
+ multixact_new.o \
+ multixact_old.o \
option.o \
parallel.o \
pg_upgrade.o \
relfilenumber.o \
server.o \
+ slru_io.o \
tablespace.o \
task.o \
util.o \
diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build
index ac992f0d14b..3e46c4512cf 100644
--- a/src/bin/pg_upgrade/meson.build
+++ b/src/bin/pg_upgrade/meson.build
@@ -8,11 +8,14 @@ pg_upgrade_sources = files(
'file.c',
'function.c',
'info.c',
+ 'multixact_new.c',
+ 'multixact_old.c',
'option.c',
'parallel.c',
'pg_upgrade.c',
'relfilenumber.c',
'server.c',
+ 'slru_io.c',
'tablespace.c',
'task.c',
'util.c',
diff --git a/src/bin/pg_upgrade/multixact_new.c b/src/bin/pg_upgrade/multixact_new.c
new file mode 100644
index 00000000000..d7a58a75de1
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_new.c
@@ -0,0 +1,253 @@
+/*
+ * multixact_new.c
+ *
+ * Rewrite pre-v19 multixacts to new format with 64-bit MultiXactOffsets
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_new.c
+ */
+
+#include "multixact_new.h"
+
+/*
+ * NOTE: Below are a bunch of definitions and simple inline functions that are
+ * copy-pasted from multixact.c
+ */
+typedef int32 ShortMultiXactOffset;
+
+/* We need four bytes per offset, 8 bytes for the base */
+#define MULTIXACT_OFFSETS_PER_PAGE \
+ ((BLCKSZ - sizeof(MultiXactOffset)) / sizeof(ShortMultiXactOffset))
+
+static inline int64
+MultiXactIdToOffsetPage(MultiXactId multi)
+{
+ return multi / MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+static inline int
+MultiXactIdToOffsetEntry(MultiXactId multi)
+{
+ return multi % MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+/* We need eight bits per xact, so one xact fits in a byte */
+#define MXACT_MEMBER_BITS_PER_XACT 8
+#define MXACT_MEMBER_FLAGS_PER_BYTE 1
+#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
+
+/* how many full bytes of flags are there in a group? */
+#define MULTIXACT_FLAGBYTES_PER_GROUP 4
+#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
+ (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
+/* size in bytes of a complete group */
+#define MULTIXACT_MEMBERGROUP_SIZE \
+ (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERS_PER_PAGE \
+ (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
+
+/*
+ * Because the number of items per page is not a divisor of the last item
+ * number (member 0xFFFFFFFF), the last segment does not use the maximum number
+ * of pages, and moreover the last used page therein does not use the same
+ * number of items as previous pages. (Another way to say it is that the
+ * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
+ * has some empty space after that item.)
+ *
+ * This constant is the number of members in the last page of the last segment.
+ */
+#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
+ ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
+
+/* page in which a member is to be found */
+static inline int64
+MXOffsetToMemberPage(MultiXactOffset offset)
+{
+ return offset / MULTIXACT_MEMBERS_PER_PAGE;
+}
+
+/* Location (byte offset within page) of flag word for a given member */
+static inline int
+MXOffsetToFlagsOffset(MultiXactOffset offset)
+{
+ MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+ int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
+ int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
+
+ return byteoff;
+}
+
+static inline int
+MXOffsetToFlagsBitShift(MultiXactOffset offset)
+{
+ int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+ int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
+
+ return bshift;
+}
+
+/* Location (byte offset within page) of TransactionId of given member */
+static inline int
+MXOffsetToMemberOffset(MultiXactOffset offset)
+{
+ int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+
+ return MXOffsetToFlagsOffset(offset) +
+ MULTIXACT_FLAGBYTES_PER_GROUP +
+ member_in_group * sizeof(TransactionId);
+}
+
+/*
+ * To avoid diving deep into address arithmetic, we declare an auxiliary
+ * structure that accesses the MultiXactOffset SLRU page.
+ */
+typedef struct MultiXactOffsetSLRUPage
+{
+ MultiXactOffset base;
+ ShortMultiXactOffset offset[FLEXIBLE_ARRAY_MEMBER];
+} MultiXactOffsetSLRUPage;
+
+static inline void
+MXOffsetWrite(char *buf, int entryno, MultiXactOffset offset)
+{
+ MultiXactOffsetSLRUPage *page = (MultiXactOffsetSLRUPage *) buf;
+
+ if (page->base != 0)
+ page->offset[entryno] = offset - page->base;
+ else
+ {
+ page->base = offset;
+ page->offset[entryno] = 0;
+ }
+
+ /*
+ * We need to distinguish between an uninitialized entry and an offset that
+ * has not been written yet.  See case 2 in GetMultiXactIdMembers.
+ *
+ * So, mark this offset as initialized by flipping its high bit.
+ */
+ page->offset[entryno] ^= 0x80000000;
+}
+
+/* Create a writer for new-format multixact SLRU files under 'pgdata'. */
+MultiXactWriter *
+AllocMultiXactWrite(char *pgdata, MultiXactId firstMulti,
+					MultiXactOffset firstOffset)
+{
+	MultiXactWriter *state = pg_malloc(sizeof(*state));
+	char		dir[MAXPGPATH] = {0};
+
+	state->nextMXact = firstMulti;
+	state->nextOffset = firstOffset;
+	/* snprintf() bounds the write, so an overlong pgdata cannot overrun dir */
+	snprintf(dir, sizeof(dir), "%s/pg_multixact/offsets", pgdata);
+	state->offset = AllocSlruWrite(dir, false);
+	snprintf(dir, sizeof(dir), "%s/pg_multixact/members", pgdata);
+	state->members = AllocSlruWrite(dir, true /* use long segment names */);
+
+	return state;
+}
+
+/*
+ * Simplified copy of the corresponding server function
+ */
+MultiXactId
+GetNewMultiXactId(MultiXactWriter *state, int nmembers, MultiXactOffset *offset)
+{
+ MultiXactId result;
+
+ /* Handle wraparound of the nextMXact counter */
+ if (state->nextMXact < FirstMultiXactId)
+ state->nextMXact = FirstMultiXactId;
+
+ /* Assign the MXID */
+ result = state->nextMXact;
+
+ /* Reserve the members space, similarly to above. */
+ *offset = state->nextOffset;
+
+ /*
+ * Advance counters. As in GetNewTransactionId(), this must not happen
+ * until after file extension has succeeded!
+ *
+ * We don't care about MultiXactId wraparound here; it will be handled by
+ * the next iteration. But note that nextMXact may be InvalidMultiXactId
+ * or the first value on a segment-beginning page after this routine
+ * exits, so anyone else looking at the variable must be prepared to deal
+ * with either case. Similarly, nextOffset may be zero, but we won't use
+ * that as the actual start offset of the next multixact.
+ */
+ (state->nextMXact)++;
+
+ state->nextOffset += nmembers;
+
+ return result;
+}
+
+/*
+ * Write a new multixact with members.
+ *
+ * Simplified version of the corresponding server function, hence the name.
+ */
+void
+RecordNewMultiXact(MultiXactWriter *state, MultiXactOffset offset,
+ MultiXactId multi, int nmembers, MultiXactMember *members)
+{
+ int64 pageno;
+ int64 prev_pageno;
+ int entryno,
+ i;
+ char *buf;
+
+ pageno = MultiXactIdToOffsetPage(multi);
+ entryno = MultiXactIdToOffsetEntry(multi);
+
+ buf = SlruWriteSwitchPage(state->offset, pageno);
+ MXOffsetWrite(buf, entryno, offset);
+
+ prev_pageno = -1;
+
+ for (i = 0; i < nmembers; i++, offset++)
+ {
+ TransactionId *memberptr;
+ uint32 *flagsptr;
+ uint32 flagsval;
+ int bshift;
+ int flagsoff;
+ int memberoff;
+
+ Assert(members[i].status <= MultiXactStatusUpdate);
+
+ pageno = MXOffsetToMemberPage(offset);
+ memberoff = MXOffsetToMemberOffset(offset);
+ flagsoff = MXOffsetToFlagsOffset(offset);
+ bshift = MXOffsetToFlagsBitShift(offset);
+
+ if (pageno != prev_pageno)
+ {
+ buf = SlruWriteSwitchPage(state->members, pageno);
+ prev_pageno = pageno;
+ }
+
+ memberptr = (TransactionId *) (buf + memberoff);
+
+ *memberptr = members[i].xid;
+
+ flagsptr = (uint32 *) (buf + flagsoff);
+
+ flagsval = *flagsptr;
+ flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
+ flagsval |= (members[i].status << bshift);
+ *flagsptr = flagsval;
+ }
+}
+
+void
+FreeMultiXactWrite(MultiXactWriter *state)
+{
+ FreeSlruWrite(state->offset);
+ FreeSlruWrite(state->members);
+
+ pfree(state);
+}
diff --git a/src/bin/pg_upgrade/multixact_new.h b/src/bin/pg_upgrade/multixact_new.h
new file mode 100644
index 00000000000..33d5d1b8222
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_new.h
@@ -0,0 +1,31 @@
+/*
+ * multixact_new.h
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_new.h
+ */
+
+#include "postgres_fe.h"
+
+#include "access/multixact.h"
+
+#include "slru_io.h"
+
+typedef struct MultiXactWriter
+{
+ MultiXactId nextMXact;
+ MultiXactOffset nextOffset;
+
+ SlruSegState *offset;
+ SlruSegState *members;
+} MultiXactWriter;
+
+extern MultiXactWriter *AllocMultiXactWrite(char *pgdata,
+ MultiXactId firstMulti,
+ MultiXactOffset firstOffset);
+extern MultiXactId GetNewMultiXactId(MultiXactWriter *state, int nmembers,
+ MultiXactOffset *offset);
+extern void RecordNewMultiXact(MultiXactWriter *state, MultiXactOffset offset,
+ MultiXactId multi, int nmembers,
+ MultiXactMember *members);
+extern void FreeMultiXactWrite(MultiXactWriter *writer);
diff --git a/src/bin/pg_upgrade/multixact_old.c b/src/bin/pg_upgrade/multixact_old.c
new file mode 100644
index 00000000000..6cc384d2cf2
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_old.c
@@ -0,0 +1,296 @@
+/*
+ * multixact_old.c
+ *
+ * Read pre-v19 multixacts, which use 32-bit MultiXactOffsets
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_old.c
+ */
+
+#include "multixact_old.h"
+
+#include "pg_upgrade.h"
+
+/*
+ * NOTE: below are a bunch of definitions and simple static inline functions
+ * that are copy-pasted from multixact.c from version 18. The only difference
+ * is that we use the OldMultiXactOffset type equal to uint32 instead of
+ * MultiXactOffset which became uint64.
+ */
+
+/* We need four bytes per offset; the whole page holds offsets (no base). */
+#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(OldMultiXactOffset))
+
+static inline int64
+MultiXactIdToOffsetPage(MultiXactId multi)
+{
+ return multi / MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+static inline int
+MultiXactIdToOffsetEntry(MultiXactId multi)
+{
+ return multi % MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+/*
+ * The situation for members is a bit more complex: we store one byte of
+ * additional flag bits for each TransactionId. To do this without getting
+ * into alignment issues, we store four bytes of flags, and then the
+ * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
+ * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
+ * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * performance) trumps space efficiency here.
+ *
+ * Note that the "offset" macros work with byte offset, not array indexes, so
+ * arithmetic must be done using "char *" pointers.
+ */
+/* We need eight bits per xact, so one xact fits in a byte */
+#define MXACT_MEMBER_BITS_PER_XACT 8
+#define MXACT_MEMBER_FLAGS_PER_BYTE 1
+#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
+
+/* how many full bytes of flags are there in a group? */
+#define MULTIXACT_FLAGBYTES_PER_GROUP 4
+#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
+ (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
+/* size in bytes of a complete group */
+#define MULTIXACT_MEMBERGROUP_SIZE \
+ (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERS_PER_PAGE \
+ (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
+
+/* page in which a member is to be found */
+static inline int64
+MXOffsetToMemberPage(OldMultiXactOffset offset)
+{
+ return offset / MULTIXACT_MEMBERS_PER_PAGE;
+}
+
+/* Location (byte offset within page) of flag word for a given member */
+static inline int
+MXOffsetToFlagsOffset(OldMultiXactOffset offset)
+{
+	OldMultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+	int			grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
+	int			byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
+
+	return byteoff;
+}
+
+/* Location (byte offset within page) of TransactionId of given member */
+static inline int
+MXOffsetToMemberOffset(OldMultiXactOffset offset)
+{
+ int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+
+ return MXOffsetToFlagsOffset(offset) +
+ MULTIXACT_FLAGBYTES_PER_GROUP +
+ member_in_group * sizeof(TransactionId);
+}
+
+static inline int
+MXOffsetToFlagsBitShift(OldMultiXactOffset offset)
+{
+ int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+ int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
+
+ return bshift;
+}
+
+/*
+ * Construct a reader of old-format multixacts stored under 'pgdata'.
+ *
+ * Returns malloc'd state that is passed to all the other calls in this module.
+ */
+OldMultiXactReader *
+AllocOldMultiXactRead(char *pgdata, MultiXactId nextMulti,
+					  OldMultiXactOffset nextOffset)
+{
+	OldMultiXactReader *state = pg_malloc(sizeof(*state));
+	char		dir[MAXPGPATH] = {0};
+
+	state->nextMXact = nextMulti;
+	state->nextOffset = nextOffset;
+
+	/* snprintf() bounds the write, so an overlong pgdata cannot overrun dir */
+	snprintf(dir, sizeof(dir), "%s/pg_multixact/offsets", pgdata);
+	state->offset = AllocSlruRead(dir);
+	snprintf(dir, sizeof(dir), "%s/pg_multixact/members", pgdata);
+	state->members = AllocSlruRead(dir);
+
+	return state;
+}
+
+/*
+ * This is a simplified version of the GetMultiXactIdMembers() server function.
+ *
+ * - Only return the updating member, if any. Upgrade only cares about the
+ * updaters. If there is no updating member, return the first locking-only
+ * member. We don't have any way to represent "no members", but we also don't
+ * need to preserve all the locking members.
+ *
+ * - We don't need to worry about locking and some corner cases because there's
+ * no concurrent activity.
+ */
+void
+GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi,
+ TransactionId *result, MultiXactStatus *status)
+{
+ MultiXactId nextMXact,
+ nextOffset,
+ tmpMXact;
+ int64 pageno,
+ prev_pageno;
+ int entryno,
+ length;
+ char *buf;
+ OldMultiXactOffset *offptr,
+ offset;
+ TransactionId result_xid = InvalidTransactionId;
+ bool result_isupdate = false;
+
+ nextMXact = state->nextMXact;
+ nextOffset = state->nextOffset;
+
+ /*
+ * See GetMultiXactIdMembers in multixact.c
+ *
+ * Find out the offset at which we need to start reading MultiXactMembers
+ * and the number of members in the multixact. We determine the latter as
+ * the difference between this multixact's starting offset and the next
+ * one's. However, there are some corner cases to worry about:
+ *
+ * 1. This multixact may be the latest one created, in which case there is
+ * no next one to look at. In this case the nextOffset value we just
+ * saved is the correct endpoint.
+ *
+ * 2. The next multixact may still be in process of being filled in...
+ * This cannot happen during upgrade.
+ *
+ * 3. Because GetNewMultiXactId increments offset zero to offset one to
+ * handle case #2, there is an ambiguity near the point of offset
+ * wraparound. If we see next multixact's offset is one, is that our
+ * multixact's actual endpoint, or did it end at zero with a subsequent
+ * increment? We handle this using the knowledge that if the zero'th
+ * member slot wasn't filled, it'll contain zero, and zero isn't a valid
+ * transaction ID so it can't be a multixact member. Therefore, if we
+ * read a zero from the members array, just ignore it.
+ */
+
+ pageno = MultiXactIdToOffsetPage(multi);
+ entryno = MultiXactIdToOffsetEntry(multi);
+
+ buf = SlruReadSwitchPage(state->offset, pageno);
+ offptr = (OldMultiXactOffset *) buf;
+ offptr += entryno;
+ offset = *offptr;
+
+ Assert(offset != 0);
+
+ /*
+ * Use the same increment rule as GetNewMultiXactId(), that is, don't
+ * handle wraparound explicitly until needed.
+ */
+ tmpMXact = multi + 1;
+
+ if (nextMXact == tmpMXact)
+ {
+ /* Corner case 1: there is no next multixact */
+ length = nextOffset - offset;
+ }
+ else
+ {
+ OldMultiXactOffset nextMXOffset;
+
+ /* handle wraparound if needed */
+ if (tmpMXact < FirstMultiXactId)
+ tmpMXact = FirstMultiXactId;
+
+ prev_pageno = pageno;
+
+ pageno = MultiXactIdToOffsetPage(tmpMXact);
+ entryno = MultiXactIdToOffsetEntry(tmpMXact);
+
+ if (pageno != prev_pageno)
+ buf = SlruReadSwitchPage(state->offset, pageno);
+
+ offptr = (OldMultiXactOffset *) buf;
+ offptr += entryno;
+ nextMXOffset = *offptr;
+
+ /*
+ * Corner case 2: next multixact is still being filled in, this must
+ * not happen during upgrade.
+ */
+ Assert(nextMXOffset != 0);
+
+ length = nextMXOffset - offset;
+ }
+
+ prev_pageno = -1;
+ for (int i = 0; i < length; i++, offset++)
+ {
+ TransactionId *xactptr;
+ uint32 *flagsptr;
+ int flagsoff;
+ int bshift;
+ int memberoff;
+ MultiXactStatus st;
+
+ pageno = MXOffsetToMemberPage(offset);
+ memberoff = MXOffsetToMemberOffset(offset);
+
+ if (pageno != prev_pageno)
+ {
+ buf = SlruReadSwitchPage(state->members, pageno);
+ prev_pageno = pageno;
+ }
+
+ xactptr = (TransactionId *) (buf + memberoff);
+ if (!TransactionIdIsValid(*xactptr))
+ {
+ /* Corner case 3: we must be looking at unused slot zero */
+ Assert(offset == 0);
+ continue;
+ }
+
+ flagsoff = MXOffsetToFlagsOffset(offset);
+ bshift = MXOffsetToFlagsBitShift(offset);
+ flagsptr = (uint32 *) (buf + flagsoff);
+
+ st = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
+
+ /* Verify that there is a single update Xid among the given members. */
+ if (ISUPDATE_from_mxstatus(st))
+ {
+ if (result_isupdate)
+ pg_fatal("multixact %u has more than one updating member",
+ multi);
+ result_xid = *xactptr;
+ result_isupdate = true;
+ }
+ else if (!TransactionIdIsValid(result_xid))
+ result_xid = *xactptr;
+ }
+
+ /* A multixid with zero members should not happen */
+ Assert(TransactionIdIsValid(result_xid));
+
+ *result = result_xid;
+ *status = result_isupdate ? MultiXactStatusUpdate :
+ MultiXactStatusForKeyShare;
+}
+
+/*
+ * Frees the malloced reader.
+ */
+void
+FreeOldMultiXactReader(OldMultiXactReader *state)
+{
+ FreeSlruRead(state->offset);
+ FreeSlruRead(state->members);
+
+ pfree(state);
+}
diff --git a/src/bin/pg_upgrade/multixact_old.h b/src/bin/pg_upgrade/multixact_old.h
new file mode 100644
index 00000000000..8d4659ba6a0
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_old.h
@@ -0,0 +1,31 @@
+/*
+ * multixact_old.h
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_old.h
+ */
+
+#include "postgres_fe.h"
+
+#include "access/multixact.h"
+#include "slru_io.h"
+
+typedef uint32 OldMultiXactOffset;
+
+typedef struct OldMultiXactReader
+{
+ MultiXactId nextMXact;
+ OldMultiXactOffset nextOffset;
+
+ SlruSegState *offset;
+ SlruSegState *members;
+} OldMultiXactReader;
+
+extern OldMultiXactReader *AllocOldMultiXactRead(char *pgdata,
+ MultiXactId nextMulti,
+ OldMultiXactOffset nextOffset);
+extern void GetOldMultiXactIdSingleMember(OldMultiXactReader *state,
+ MultiXactId multi,
+ TransactionId *result,
+ MultiXactStatus *status);
+extern void FreeOldMultiXactReader(OldMultiXactReader *reader);
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 490e98fa26f..5432c03a2b0 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -49,6 +49,8 @@
#include "common/restricted_token.h"
#include "fe_utils/string_utils.h"
#include "pg_upgrade.h"
+#include "multixact_old.h"
+#include "multixact_new.h"
/*
* Maximum number of pg_restore actions (TOC entries) to process within one
@@ -769,6 +771,82 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir)
check_ok();
}
+/*
+ * Convert pg_multixact/offsets and /members to new format with 64-bit offsets.
+ */
+static void
+convert_multixacts(MultiXactId *new_nxtmulti, MultiXactOffset *new_nxtmxoff)
+{
+ MultiXactId oldest_multi,
+ next_multi;
+ OldMultiXactReader *old_reader;
+ MultiXactWriter *new_writer;
+
+ old_reader = AllocOldMultiXactRead(old_cluster.pgdata,
+ old_cluster.controldata.chkpnt_nxtmulti,
+ old_cluster.controldata.chkpnt_nxtmxoff);
+ new_writer = AllocMultiXactWrite(new_cluster.pgdata,
+ old_cluster.controldata.chkpnt_oldstMulti,
+ 1 /* see below */);
+
+ oldest_multi = old_cluster.controldata.chkpnt_oldstMulti;
+ next_multi = old_cluster.controldata.chkpnt_nxtmulti;
+
+ /* handle wraparound */
+ if (next_multi < FirstMultiXactId)
+ next_multi = FirstMultiXactId;
+
+ /*
+ * Read multixids from old files one by one, and write them back in the new
+ * format.
+ *
+ * The locking-only XIDs that may be part of multi-xids don't matter after
+ * upgrade, as there can be no transactions running across upgrade. So as
+ * a little optimization, we only read one member from each multixid: the
+ * updating one or, if there was no update, arbitrarily the first locking
+ * xid.
+ */
+ for (MultiXactId multi = oldest_multi; multi != next_multi;)
+ {
+ TransactionId xid;
+ MultiXactStatus status;
+ MultiXactMember member;
+ MultiXactId new_multi PG_USED_FOR_ASSERTS_ONLY;
+ MultiXactOffset offset;
+
+ /* Read the old multixid */
+ GetOldMultiXactIdSingleMember(old_reader, multi, &xid, &status);
+
+ /* Write it out in new format */
+ member.xid = xid;
+ member.status = status;
+ new_multi = GetNewMultiXactId(new_writer, 1, &offset);
+
+ Assert(new_multi == multi);
+
+ RecordNewMultiXact(new_writer, offset, multi, 1, &member);
+
+ multi++;
+ /* handle wraparound */
+ if (multi < FirstMultiXactId)
+ multi = FirstMultiXactId;
+ }
+
+ /*
+ * Update the nextMXact/Offset values in the control file to match what we
+ * wrote. The nextMXact should be unchanged, but because we ignored the
+ * locking-only member XIDs, the nextOffset will be different.
+ */
+ Assert(new_writer->nextMXact == next_multi);
+
+ *new_nxtmulti = next_multi;
+ *new_nxtmxoff = new_writer->nextOffset;
+
+ /* Release resources */
+ FreeMultiXactWrite(new_writer);
+ FreeOldMultiXactReader(old_reader);
+}
+
static void
copy_xact_xlog_xid(void)
{
@@ -816,8 +894,28 @@ copy_xact_xlog_xid(void)
if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
{
- copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
- copy_subdir_files("pg_multixact/members", "pg_multixact/members");
+ MultiXactId new_nxtmulti = old_cluster.controldata.chkpnt_nxtmulti;
+ MultiXactOffset new_nxtmxoff = old_cluster.controldata.chkpnt_nxtmxoff;
+
+ /*
+ * If the old server is before the MULTIXACTOFFSET_FORMATCHANGE_CAT_VER
+ * it must have 32-bit multixid offsets, thus it should be converted.
+ */
+ if (old_cluster.controldata.cat_ver < MULTIXACTOFFSET_FORMATCHANGE_CAT_VER &&
+ new_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER)
+ {
+ remove_new_subdir("pg_multixact/members", false);
+ remove_new_subdir("pg_multixact/offsets", false);
+
+ prep_status("Converting pg_multixact/offsets to 64-bit");
+ convert_multixacts(&new_nxtmulti, &new_nxtmxoff);
+ check_ok();
+ }
+ else
+ {
+ copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
+ copy_subdir_files("pg_multixact/members", "pg_multixact/members");
+ }
prep_status("Setting next multixact ID and offset for new cluster");
@@ -826,10 +924,8 @@ copy_xact_xlog_xid(void)
* counters here and the oldest multi present on system.
*/
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
- "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"",
- new_cluster.bindir,
- old_cluster.controldata.chkpnt_nxtmxoff,
- old_cluster.controldata.chkpnt_nxtmulti,
+ "\"%s/pg_resetwal\" -O %" PRIu64 " -m %u,%u \"%s\"",
+ new_cluster.bindir, new_nxtmxoff, new_nxtmulti,
old_cluster.controldata.chkpnt_oldstMulti,
new_cluster.pgdata);
check_ok();
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index e86336f4be9..127b2cb00fa 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -114,6 +114,11 @@ extern char *output_files[];
*/
#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
+/*
+ * Switch from 32-bit to 64-bit for multixid offsets.
+ */
+#define MULTIXACTOFFSET_FORMATCHANGE_CAT_VER 999999999
+
/*
* large object chunk size added to pg_controldata,
* commit 5f93c37805e7485488480916b4585e098d3cc883
diff --git a/src/bin/pg_upgrade/slru_io.c b/src/bin/pg_upgrade/slru_io.c
new file mode 100644
index 00000000000..4e823199303
--- /dev/null
+++ b/src/bin/pg_upgrade/slru_io.c
@@ -0,0 +1,240 @@
+/*
+ * slru_io.c
+ *
+ * Routines for reading and writing SLRU files during upgrade.
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/slru_io.c
+ */
+
+#include "postgres_fe.h"
+
+#include <fcntl.h>
+
+#include "pg_upgrade.h"
+#include "slru_io.h"
+
+#include "common/fe_memutils.h"
+#include "common/file_perm.h"
+#include "common/file_utils.h"
+#include "port/pg_iovec.h"
+
+/*
+ * State for reading or writing an SLRU, with a one page buffer.
+ */
+typedef struct SlruSegState
+{
+ bool writing;
+ bool long_segment_names;
+
+ char *dir;
+ char *fn;
+ int fd;
+ int64 segno;
+ uint64 pageno;
+
+ PGAlignedBlock buf;
+} SlruSegState;
+
+static inline SlruSegState *
+AllocSlruSegState(char *dir)
+{
+ SlruSegState *state = pg_malloc(sizeof(*state));
+
+ state->segno = -1;
+ state->pageno = 0;
+ state->dir = pstrdup(dir);
+ state->fd = -1;
+ state->fn = NULL;
+
+ return state;
+}
+
+static inline void
+SlruFlush(SlruSegState *state)
+{
+ struct iovec iovec = {
+ .iov_base = &state->buf,
+ .iov_len = BLCKSZ,
+ };
+ off_t offset;
+
+ if (state->segno == -1)
+ return;
+
+ offset = (state->pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+ if (pg_pwritev_with_retry(state->fd, &iovec, 1, offset) < 0)
+ pg_fatal("could not write file \"%s\": %m", state->fn);
+}
+
+/*
+ * Create slru reader for dir.
+ *
+ * Returns the malloced memory used by all the other read calls in this module.
+ */
+SlruSegState *
+AllocSlruRead(char *dir)
+{
+ SlruSegState *state = AllocSlruSegState(dir);
+
+ state->writing = false;
+
+ return state;
+}
+
+/*
+ * Open given page for reading.
+ *
+ * Reading can be done in random order.  Returns a pointer to the one-page
+ * buffer in 'state', valid until the next switch.  A short read is fatal, as
+ * the buffer would otherwise keep stale bytes from the previous page.
+ */
+char *
+SlruReadSwitchPage(SlruSegState *state, uint64 pageno)
+{
+	int64		segno = pageno / SLRU_PAGES_PER_SEGMENT;
+	off_t		offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+	ssize_t		nread;
+	struct iovec iovec = {
+		.iov_base = &state->buf,
+		.iov_len = BLCKSZ,
+	};
+
+	Assert(!state->writing);	/* read only mode */
+
+	/* Fast path: the requested page is already in the buffer */
+	if (state->segno != -1 && pageno == state->pageno)
+		return state->buf.data;
+
+	if (segno != state->segno)
+	{
+		if (state->segno != -1)
+		{
+			close(state->fd);
+			state->fd = -1;
+			pg_free(state->fn);
+			state->fn = NULL;
+		}
+
+		/* Open new segment */
+		state->fn = psprintf("%s/%04X", state->dir, (unsigned int) segno);
+		if ((state->fd = open(state->fn, O_RDONLY | PG_BINARY, 0)) < 0)
+			pg_fatal("could not open file \"%s\": %m", state->fn);
+		state->segno = segno;
+	}
+
+	nread = pg_preadv(state->fd, &iovec, 1, offset);
+	if (nread < 0)
+		pg_fatal("could not read file \"%s\": %m", state->fn);
+	if (nread != BLCKSZ)
+		pg_fatal("could not read file \"%s\": read %d of %d",
+				 state->fn, (int) nread, BLCKSZ);
+	state->pageno = pageno;
+	return state->buf.data;
+}
+
+/* Frees the reader: closes any open segment and releases all owned memory. */
+void
+FreeSlruRead(SlruSegState *state)
+{
+	Assert(!state->writing);	/* read only mode */
+	if (state->fd >= 0)			/* no segment was opened if fd is still -1 */
+		close(state->fd);
+	pg_free(state->fn);			/* psprintf'd segment path, may be NULL */
+	pg_free(state->dir);		/* pstrdup'd in AllocSlruSegState */
+	pg_free(state);
+}
+
+/*
+ * Open the given page for writing.
+ *
+ * NOTE: This uses O_EXCL when stepping to a new segment, so this assumes that
+ * each segment is written in full before moving on to next one. This
+ * limitation would be easy to lift if needed, but it fits the usage pattern of
+ * current callers.
+ */
+char *
+SlruWriteSwitchPage(SlruSegState *state, uint64 pageno)
+{
+ int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
+ off_t offset;
+
+ if (state->segno != -1 && pageno == state->pageno)
+ return state->buf.data;
+
+ segno = pageno / SLRU_PAGES_PER_SEGMENT;
+ offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+ SlruFlush(state);
+ memset(state->buf.data, 0, BLCKSZ);
+
+ if (segno != state->segno)
+ {
+ if (state->segno != -1)
+ {
+ close(state->fd);
+ state->fd = -1;
+
+ pg_free(state->fn);
+ state->fn = NULL;
+ }
+
+ /* Create the segment */
+ if (state->long_segment_names)
+ {
+ Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
+ state->fn = psprintf("%s/%015" PRIX64, state->dir, segno);
+ }
+ else
+ {
+ Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
+ state->fn = psprintf("%s/%04X", state->dir, (unsigned int) segno);
+ }
+
+ if ((state->fd = open(state->fn, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ {
+ pg_fatal("could not create file \"%s\": %m", state->fn);
+ }
+
+ state->segno = segno;
+
+ if (offset > 0 && pg_pwrite_zeros(state->fd, offset, 0) < 0)
+ pg_fatal("could not write file \"%s\": %m", state->fn);
+ }
+
+ state->pageno = pageno;
+
+ return state->buf.data;
+}
+
+/*
+ * Create slru writer for dir.
+ *
+ * Returns the malloced memory used by all the other write calls in this module.
+ */
+SlruSegState *
+AllocSlruWrite(char *dir, bool long_segment_names)
+{
+ SlruSegState *state = AllocSlruSegState(dir);
+
+ state->writing = true;
+ state->long_segment_names = long_segment_names;
+
+ return state;
+}
+
+/* Flushes the last buffered page, closes the segment, frees the writer. */
+void
+FreeSlruWrite(SlruSegState *state)
+{
+	Assert(state->writing);
+
+	SlruFlush(state);			/* no-op if no page was ever switched to */
+	if (state->fd >= 0)			/* no segment was created if fd is still -1 */
+		close(state->fd);
+	pg_free(state->fn);			/* psprintf'd segment path, may be NULL */
+	pg_free(state->dir);		/* pstrdup'd in AllocSlruSegState */
+	pg_free(state);
+}
diff --git a/src/bin/pg_upgrade/slru_io.h b/src/bin/pg_upgrade/slru_io.h
new file mode 100644
index 00000000000..920b8ae82e2
--- /dev/null
+++ b/src/bin/pg_upgrade/slru_io.h
@@ -0,0 +1,30 @@
+/*
+ * slru_io.h
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/slru_io.h
+ */
+
+/*
+ * Some kind of iterator associated with a particular SLRU segment. The idea is
+ * to specify the segment and page number and then move through the pages.
+ */
+
+#include "postgres_fe.h"
+
+/*
+ * See access/slru.h
+ *
+ * Copy here, since slru.h could not be included in fe code.
+ */
+#define SLRU_PAGES_PER_SEGMENT 32
+
+typedef struct SlruSegState SlruSegState;
+
+extern SlruSegState *AllocSlruRead(char *dir);
+extern char *SlruReadSwitchPage(SlruSegState *state, uint64 pageno);
+extern void FreeSlruRead(SlruSegState *state);
+
+extern SlruSegState *AllocSlruWrite(char *dir, bool long_segment_names);
+extern char *SlruWriteSwitchPage(SlruSegState *state, uint64 pageno);
+extern void FreeSlruWrite(SlruSegState *state);
--
2.39.5 (Apple Git-154)