v10-0010-rewrite-pg_upgrade-code.patch
text/x-patch
Filename: v10-0010-rewrite-pg_upgrade-code.patch
Type: text/x-patch
Part: 9
Message:
Re: POC: make mxidoff 64 bits
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch v10-0010
Subject: rewrite pg_upgrade code
| File | + | − |
|---|---|---|
| src/backend/access/transam/multixact.c | 9 | 27 |
| src/bin/pg_upgrade/Makefile | 3 | 1 |
| src/bin/pg_upgrade/meson.build | 3 | 1 |
| src/bin/pg_upgrade/multixact_old.c | 340 | 0 |
| src/bin/pg_upgrade/multixact_old.h | 12 | 0 |
| src/bin/pg_upgrade/multixact_rewrite.c | 238 | 0 |
| src/bin/pg_upgrade/pg_upgrade.c | 7 | 22 |
| src/bin/pg_upgrade/pg_upgrade.h | 2 | 3 |
| src/bin/pg_upgrade/segresize.c | 0 | 527 |
| src/bin/pg_upgrade/slru_io.c | 214 | 0 |
| src/bin/pg_upgrade/slru_io.h | 23 | 0 |
From 6cc69b50f677a08c72e7a10fd043f2c0af7072bc Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Wed, 18 Dec 2024 01:07:03 +0200
Subject: [PATCH v10 10/14] rewrite pg_upgrade code
---
src/backend/access/transam/multixact.c | 36 +-
src/bin/pg_upgrade/Makefile | 4 +-
src/bin/pg_upgrade/meson.build | 4 +-
src/bin/pg_upgrade/multixact_old.c | 340 ++++++++++++++++
src/bin/pg_upgrade/multixact_old.h | 12 +
src/bin/pg_upgrade/multixact_rewrite.c | 238 +++++++++++
src/bin/pg_upgrade/pg_upgrade.c | 29 +-
src/bin/pg_upgrade/pg_upgrade.h | 5 +-
src/bin/pg_upgrade/segresize.c | 527 -------------------------
src/bin/pg_upgrade/slru_io.c | 214 ++++++++++
src/bin/pg_upgrade/slru_io.h | 23 ++
11 files changed, 851 insertions(+), 581 deletions(-)
create mode 100644 src/bin/pg_upgrade/multixact_old.c
create mode 100644 src/bin/pg_upgrade/multixact_old.h
create mode 100644 src/bin/pg_upgrade/multixact_rewrite.c
delete mode 100644 src/bin/pg_upgrade/segresize.c
create mode 100644 src/bin/pg_upgrade/slru_io.c
create mode 100644 src/bin/pg_upgrade/slru_io.h
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index b786ee23563..ea09f8606cf 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1103,7 +1103,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
MultiXactOffset *offptr;
MultiXactOffset offset;
int length;
- int truelength;
MultiXactId oldestMXact;
MultiXactId nextMXact;
MultiXactId tmpMXact;
@@ -1202,16 +1201,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
* we have just for this; the process in charge will signal the CV as soon
* as it has finished writing the multixact offset.
*
- * FIXME: case 3 is now only needed for pg_upgraded clusters
- * 3. Because GetNewMultiXactId increments offset zero to offset one to
- * handle case #2, there is an ambiguity near the point of offset
- * wraparound. If we see next multixact's offset is one, is that our
- * multixact's actual endpoint, or did it end at zero with a subsequent
- * increment? We handle this using the knowledge that if the zero'th
- * member slot wasn't filled, it'll contain zero, and zero isn't a valid
- * transaction ID so it can't be a multixact member. Therefore, if we
- * read a zero from the members array, just ignore it.
- *
* This is all pretty messy, but the mess occurs only in infrequent corner
* cases, so it seems better than holding the MultiXactGenLock for a long
* time on every multixact creation.
@@ -1298,6 +1287,9 @@ retry:
LWLockRelease(lock);
lock = NULL;
+ /* A multixid with zero members should not happen */
+ Assert(length > 0);
+
/*
* If we slept above, clean up state; it's no longer needed.
*/
@@ -1306,7 +1298,6 @@ retry:
ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
- truelength = 0;
prev_pageno = -1;
for (int i = 0; i < length; i++, offset++)
{
@@ -1344,36 +1335,27 @@ retry:
xactptr = (TransactionId *)
(MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
- if (!TransactionIdIsValid(*xactptr))
- {
- /* Corner case 3: we must be looking at unused slot zero */
- Assert(offset == 0);
- continue;
- }
+ Assert(TransactionIdIsValid(*xactptr));
flagsoff = MXOffsetToFlagsOffset(offset);
bshift = MXOffsetToFlagsBitShift(offset);
flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
- ptr[truelength].xid = *xactptr;
- ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
- truelength++;
+ ptr[i].xid = *xactptr;
+ ptr[i].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
}
LWLockRelease(lock);
- /* A multixid with zero members should not happen */
- Assert(truelength > 0);
-
/*
* Copy the result into the local cache.
*/
- mXactCachePut(multi, truelength, ptr);
+ mXactCachePut(multi, length, ptr);
debug_elog3(DEBUG2, "GetMembers: no cache for %s",
- mxid_to_string(multi, truelength, ptr));
+ mxid_to_string(multi, length, ptr));
*members = ptr;
- return truelength;
+ return length;
}
/*
diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile
index 70908d63a31..b4ad01c00b2 100644
--- a/src/bin/pg_upgrade/Makefile
+++ b/src/bin/pg_upgrade/Makefile
@@ -19,12 +19,14 @@ OBJS = \
file.o \
function.o \
info.o \
+ multixact_old.o \
+ multixact_rewrite.o \
option.o \
parallel.o \
- segresize.o \
pg_upgrade.o \
relfilenumber.o \
server.o \
+ slru_io.o \
tablespace.o \
task.o \
util.o \
diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build
index 16f898ba148..2dffc48b3d2 100644
--- a/src/bin/pg_upgrade/meson.build
+++ b/src/bin/pg_upgrade/meson.build
@@ -8,12 +8,14 @@ pg_upgrade_sources = files(
'file.c',
'function.c',
'info.c',
+ 'multixact_old.c',
+ 'multixact_rewrite.c',
'option.c',
'parallel.c',
- 'segresize.c',
'pg_upgrade.c',
'relfilenumber.c',
'server.c',
+ 'slru_io.c',
'tablespace.c',
'task.c',
'util.c',
diff --git a/src/bin/pg_upgrade/multixact_old.c b/src/bin/pg_upgrade/multixact_old.c
new file mode 100644
index 00000000000..14988c105ce
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_old.c
@@ -0,0 +1,340 @@
+/*
+ * multixact_old.c
+ *
+ * Support for reading pre-v18 format pg_multixact files
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_old.c
+ */
+
+#include "postgres_fe.h"
+
+#include "access/transam.h"
+#include "pg_upgrade.h"
+#include "multixact_old.h"
+#include "slru_io.h"
+
+/*
+ * Below are a bunch of definitions that are copy-pasted from multixact.c from
+ * version 17. They shadow the new definitions in access/multixact.h, so it's
+ * important that we *don't* include that here. That is a big reason this
+ * code has to be in a separate source file.
+ *
+ * All references to MultiXactOffset have been replaced with OldMultiXactOffset.
+ */
+typedef uint32 OldMultiXactOffset;
+
+#define FirstMultiXactId ((MultiXactId) 1)
+
+/*
+ * Possible multixact lock modes ("status"). The first four modes are for
+ * tuple locks (FOR KEY SHARE, FOR SHARE, FOR NO KEY UPDATE, FOR UPDATE); the
+ * next two are used for update and delete modes.
+ */
+typedef enum
+{
+ MultiXactStatusForKeyShare = 0x00,
+ MultiXactStatusForShare = 0x01,
+ MultiXactStatusForNoKeyUpdate = 0x02,
+ MultiXactStatusForUpdate = 0x03,
+ /* an update that doesn't touch "key" columns */
+ MultiXactStatusNoKeyUpdate = 0x04,
+ /* other updates, and delete */
+ MultiXactStatusUpdate = 0x05,
+} MultiXactStatus;
+
+/* does a status value correspond to a tuple update? */
+#define ISUPDATE_from_mxstatus(status) \
+ ((status) > MultiXactStatusForUpdate)
+
+/*
+ * Defines for OldMultiXactOffset page sizes. A page is the same BLCKSZ as is
+ * used everywhere else in Postgres.
+ *
+ * Note: because OldMultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
+ * MultiXact page numbering also wraps around at
+ * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
+ * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need
+ * take no explicit notice of that fact in this module, except when comparing
+ * segment and page numbers in TruncateMultiXact (see
+ * OldMultiXactOffsetPagePrecedes).
+ */
+
+/* We need four bytes per offset */
+#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(uint32))
+
+static inline int64
+MultiXactIdToOffsetPage(MultiXactId multi)
+{
+ return multi / MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+static inline int
+MultiXactIdToOffsetEntry(MultiXactId multi)
+{
+ return multi % MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+static inline int64
+MultiXactIdToOffsetSegment(MultiXactId multi)
+{
+ return MultiXactIdToOffsetPage(multi) / SLRU_PAGES_PER_SEGMENT;
+}
+
+/*
+ * The situation for members is a bit more complex: we store one byte of
+ * additional flag bits for each TransactionId. To do this without getting
+ * into alignment issues, we store four bytes of flags, and then the
+ * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
+ * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
+ * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * performance) trumps space efficiency here.
+ *
+ * Note that the "offset" macros work with byte offset, not array indexes, so
+ * arithmetic must be done using "char *" pointers.
+ */
+/* We need eight bits per xact, so one xact fits in a byte */
+#define MXACT_MEMBER_BITS_PER_XACT 8
+#define MXACT_MEMBER_FLAGS_PER_BYTE 1
+#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
+
+/* how many full bytes of flags are there in a group? */
+#define MULTIXACT_FLAGBYTES_PER_GROUP 4
+#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
+ (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
+/* size in bytes of a complete group */
+#define MULTIXACT_MEMBERGROUP_SIZE \
+ (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERS_PER_PAGE \
+ (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
+
+/* page in which a member is to be found */
+static inline int64
+MXOffsetToMemberPage(OldMultiXactOffset offset)
+{
+ return offset / MULTIXACT_MEMBERS_PER_PAGE;
+}
+
+/* Location (byte offset within page) of flag word for a given member */
+static inline int
+MXOffsetToFlagsOffset(OldMultiXactOffset offset)
+{
+ OldMultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+ int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
+ int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
+
+ return byteoff;
+}
+
+static inline int
+MXOffsetToFlagsBitShift(OldMultiXactOffset offset)
+{
+ int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+ int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
+
+ return bshift;
+}
+
+/* Location (byte offset within page) of TransactionId of given member */
+static inline int
+MXOffsetToMemberOffset(OldMultiXactOffset offset)
+{
+ int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+
+ return MXOffsetToFlagsOffset(offset) +
+ MULTIXACT_FLAGBYTES_PER_GROUP +
+ member_in_group * sizeof(TransactionId);
+}
+
+typedef struct OldMultiXactReader
+{
+ MultiXactId nextMXact;
+ uint32 nextOffset;
+
+ SlruSegState *offset;
+ SlruSegState *members;
+} OldMultiXactReader;
+
+OldMultiXactReader *
+StartOldMultiXactRead(void)
+{
+ OldMultiXactReader *state;
+ char *dir;
+
+ state = pg_malloc(sizeof(OldMultiXactReader));
+ state->nextMXact = old_cluster.controldata.chkpnt_nxtmulti;
+ state->nextOffset = old_cluster.controldata.chkpnt_nxtmxoff;
+
+ dir = psprintf("%s/pg_multixact/offsets", old_cluster.pgdata);
+ state->offset = OpenSlruRead(dir);
+ pg_free(dir);
+
+ dir = psprintf("%s/pg_multixact/members", old_cluster.pgdata);
+ state->members = OpenSlruRead(dir);
+ pg_free(dir);
+
+ return state;
+}
+
+/*
+ * This is a simplified version of the GetMultiXactIdMembers() server function.
+ *
+ * - Only return the updating member, if any. Upgrade only cares about the updaters.
+ * If there is no updating member, return the first locking-only member. We don't
+ * have any way to represent "no members", but we also don't need to preserve all
+ * the locking members.
+ *
+ * - We don't need to worry about locking and some corner cases because there's
+ * no concurrent activity.
+ */
+void
+GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi,
+ TransactionId *result, bool *isupdate)
+{
+ TransactionId result_xid;
+ bool result_isupdate;
+ int64 pageno;
+ int64 prev_pageno;
+ int entryno;
+ OldMultiXactOffset *offptr;
+ OldMultiXactOffset offset;
+ int length;
+ MultiXactId nextMXact;
+ MultiXactId tmpMXact;
+ OldMultiXactOffset nextOffset;
+ char *buf;
+
+ nextMXact = state->nextMXact;
+ nextOffset = state->nextOffset;
+
+ /*
+ * Find out the offset at which we need to start reading MultiXactMembers
+ * and the number of members in the multixact. We determine the latter as
+ * the difference between this multixact's starting offset and the next
+ * one's. However, there are some corner cases to worry about:
+ *
+ * 1. This multixact may be the latest one created, in which case there is
+ * no next one to look at. In this case the nextOffset value we just
+ * saved is the correct endpoint.
+ *
+ * 2. (this cannot happen during upgrade)
+ *
+ * 3. Because GetNewMultiXactId increments offset zero to offset one to
+ * handle case #2, there is an ambiguity near the point of offset
+ * wraparound. If we see next multixact's offset is one, is that our
+ * multixact's actual endpoint, or did it end at zero with a subsequent
+ * increment? We handle this using the knowledge that if the zero'th
+ * member slot wasn't filled, it'll contain zero, and zero isn't a valid
+ * transaction ID so it can't be a multixact member. Therefore, if we
+ * read a zero from the members array, just ignore it.
+ */
+ pageno = MultiXactIdToOffsetPage(multi);
+ entryno = MultiXactIdToOffsetEntry(multi);
+
+ buf = SlruReadSwitchPage(state->offset, pageno);
+ offptr = (OldMultiXactOffset *) buf;
+ offptr += entryno;
+ offset = *offptr;
+
+ Assert(offset != 0);
+
+ /*
+ * Use the same increment rule as GetNewMultiXactId(), that is, don't
+ * handle wraparound explicitly until needed.
+ */
+ tmpMXact = multi + 1;
+
+ if (nextMXact == tmpMXact)
+ {
+ /* Corner case 1: there is no next multixact */
+ length = nextOffset - offset;
+ }
+ else
+ {
+ OldMultiXactOffset nextMXOffset;
+
+ /* handle wraparound if needed */
+ if (tmpMXact < FirstMultiXactId)
+ tmpMXact = FirstMultiXactId;
+
+ prev_pageno = pageno;
+
+ pageno = MultiXactIdToOffsetPage(tmpMXact);
+ entryno = MultiXactIdToOffsetEntry(tmpMXact);
+
+ if (pageno != prev_pageno)
+ {
+ buf = SlruReadSwitchPage(state->offset, pageno);
+ }
+
+ offptr = (OldMultiXactOffset *) buf;
+ offptr += entryno;
+ nextMXOffset = *offptr;
+
+ if (nextMXOffset == 0)
+ {
+ /* Corner case 2: next multixact is still being filled in */
+ Assert(false); /* shouldn't happen during upgrade */
+ }
+
+ length = nextMXOffset - offset;
+ }
+
+ result_xid = InvalidTransactionId;
+ result_isupdate = false;
+ prev_pageno = -1;
+ for (int i = 0; i < length; i++, offset++)
+ {
+ TransactionId *xactptr;
+ uint32 *flagsptr;
+ int flagsoff;
+ int bshift;
+ int memberoff;
+ MultiXactStatus status;
+
+ pageno = MXOffsetToMemberPage(offset);
+ memberoff = MXOffsetToMemberOffset(offset);
+
+ if (pageno != prev_pageno)
+ {
+ buf = SlruReadSwitchPage(state->members, pageno);
+ prev_pageno = pageno;
+ }
+
+ xactptr = (TransactionId *) (buf + memberoff);
+
+ if (!TransactionIdIsValid(*xactptr))
+ {
+ /* Corner case 3: we must be looking at unused slot zero */
+ Assert(offset == 0);
+ continue;
+ }
+
+ flagsoff = MXOffsetToFlagsOffset(offset);
+ bshift = MXOffsetToFlagsBitShift(offset);
+ flagsptr = (uint32 *) (buf + flagsoff);
+
+ status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
+
+ /* Verify that there is at most one update Xid among the given members. */
+ if (ISUPDATE_from_mxstatus(status))
+ {
+ if (result_isupdate)
+ pg_fatal("multixact %u has more than one updating member",
+ multi);
+ result_xid = *xactptr;
+ result_isupdate = true;
+ }
+ else if (!TransactionIdIsValid(result_xid))
+ result_xid = *xactptr;
+ }
+
+ /* A multixid with zero members should not happen */
+ Assert(TransactionIdIsValid(result_xid));
+
+ *result = result_xid;
+ *isupdate = result_isupdate;
+}
+
+
diff --git a/src/bin/pg_upgrade/multixact_old.h b/src/bin/pg_upgrade/multixact_old.h
new file mode 100644
index 00000000000..70800c1cda5
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_old.h
@@ -0,0 +1,12 @@
+/*
+ * multixact_old.h
+ *
+ * Copyright (c) 2010-2024, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_old.h
+ */
+
+typedef struct OldMultiXactReader OldMultiXactReader;
+
+extern OldMultiXactReader *StartOldMultiXactRead(void);
+extern void GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi,
+ TransactionId *result, bool *isupdate);
diff --git a/src/bin/pg_upgrade/multixact_rewrite.c b/src/bin/pg_upgrade/multixact_rewrite.c
new file mode 100644
index 00000000000..7b3aeb80c0b
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_rewrite.c
@@ -0,0 +1,238 @@
+/*
+ * multixact_rewrite.c
+ *
+ * Rewrite pre-v18 multixacts to new format with 64-bit MultiXactOffsets
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_rewrite.c
+ */
+
+#include "postgres_fe.h"
+
+#include "multixact_old.h"
+#include "pg_upgrade.h"
+#include "slru_io.h"
+
+#include "access/multixact.h"
+#include "access/multixact_internal.h"
+
+typedef struct
+{
+ MultiXactId nextMXact;
+ MultiXactOffset nextOffset;
+
+ SlruSegState *offset;
+ SlruSegState *members;
+} MultiXactWriter;
+
+static MultiXactWriter *StartMultiXactWrite(MultiXactId firstMulti, MultiXactOffset firstOffset);
+static MultiXactId GetNewMultiXactId(MultiXactWriter *state, int nmembers, MultiXactOffset *offset);
+static void RecordNewMultiXact(MultiXactWriter *state,
+ MultiXactOffset offset,
+ MultiXactId multi,
+ int nmembers, MultiXactMember *members);
+static void CloseMultiXactWrite(MultiXactWriter *state);
+
+
+/*
+ * Convert pg_multixact/offsets and /members to new format with 64-bit offsets.
+ */
+void
+convert_multixacts(MultiXactId *new_nxtmulti, MultiXactOffset *new_nxtmxoff)
+{
+ MultiXactWriter *new_writer;
+ MultiXactId oldest_multi = old_cluster.controldata.chkpnt_oldstMulti,
+ next_multi = old_cluster.controldata.chkpnt_nxtmulti,
+ multi;
+ OldMultiXactReader *old_reader;
+
+ if (next_multi < FirstMultiXactId)
+ next_multi = FirstMultiXactId;
+
+ old_reader = StartOldMultiXactRead();
+ new_writer = StartMultiXactWrite(oldest_multi, 1);
+
+ /*
+ * Read multixids from old files one by one, and write them back in the
+ * new format.
+ *
+ * The locking-only XIDs that may be part of multi-xids don't matter after
+ * upgrade, as there can be no transactions running across upgrade. So as
+ * a little optimization, we only keep one member from each multixid: the
+ * updating member, or if there was no update, arbitrarily the first
+ * locking xid.
+ */
+ for (multi = oldest_multi; multi != next_multi;)
+ {
+ TransactionId xid;
+ bool isupdate;
+ MultiXactMember member;
+ MultiXactId newmulti;
+ MultiXactOffset offset;
+
+ /* Read the old multixid */
+ GetOldMultiXactIdSingleMember(old_reader, multi, &xid, &isupdate);
+
+ /* Write it out in new format */
+ member.xid = xid;
+ member.status = isupdate ? MultiXactStatusUpdate : MultiXactStatusForKeyShare;
+ newmulti = GetNewMultiXactId(new_writer, 1, &offset);
+ Assert(newmulti == multi);
+ RecordNewMultiXact(new_writer, offset, multi, 1, &member);
+
+ multi++;
+ if (multi < FirstMultiXactId)
+ multi = FirstMultiXactId;
+ }
+
+ /*
+ * Hand back the nextMXact/Offset values for the control file, matching what
+ * we wrote. The nextMXact should be unchanged, but because we kept only
+ * one member per multixid, the nextOffset will be different.
+ */
+ Assert(new_writer->nextMXact == next_multi);
+ *new_nxtmulti = next_multi;
+ *new_nxtmxoff = new_writer->nextOffset;
+
+ /* Release resources */
+ CloseMultiXactWrite(new_writer);
+}
+
+/* Support routines for writing the new format */
+
+static MultiXactWriter *
+StartMultiXactWrite(MultiXactId firstMulti, MultiXactOffset firstOffset)
+{
+ MultiXactWriter *state;
+ char *dir;
+
+ state = pg_malloc(sizeof(MultiXactWriter));
+ state->nextMXact = firstMulti;
+ state->nextOffset = firstOffset;
+
+ dir = psprintf("%s/pg_multixact/offsets", new_cluster.pgdata);
+ state->offset = OpenSlruWrite(dir, MultiXactIdToOffsetPage(firstMulti));
+ pg_free(dir);
+
+ dir = psprintf("%s/pg_multixact/members", new_cluster.pgdata);
+ state->members = OpenSlruWrite(dir, MXOffsetToMemberPage(1));
+ pg_free(dir);
+
+ return state;
+}
+
+static void
+CloseMultiXactWrite(MultiXactWriter *state)
+{
+ CloseSlruWrite(state->offset);
+ CloseSlruWrite(state->members);
+ pg_free(state);
+}
+
+/*
+ * Simplified copy of the corresponding server function
+ */
+static MultiXactId
+GetNewMultiXactId(MultiXactWriter *state, int nmembers, MultiXactOffset *offset)
+{
+ MultiXactId result;
+
+ /* Handle wraparound of the nextMXact counter */
+ if (state->nextMXact < FirstMultiXactId)
+ state->nextMXact = FirstMultiXactId;
+
+ /* Assign the MXID */
+ result = state->nextMXact;
+
+ /*
+ * Reserve the members space: they start at the current nextOffset.
+ */
+ *offset = state->nextOffset;
+
+ /*
+ * Advance counters. As in GetNewTransactionId(), this must not happen
+ * until after file extension has succeeded!
+ *
+ * We don't care about MultiXactId wraparound here; it will be handled by
+ * the next iteration. But note that nextMXact may be InvalidMultiXactId
+ * or the first value on a segment-beginning page after this routine
+ * exits, so anyone else looking at the variable must be prepared to deal
+ * with either case. Similarly, nextOffset may be zero, but we won't use
+ * that as the actual start offset of the next multixact.
+ */
+ (state->nextMXact)++;
+
+ state->nextOffset += nmembers;
+
+ return result;
+}
+
+/*
+ * Write a new multixact with members.
+ *
+ * Simplified version of the corresponding server function.
+ */
+static void
+RecordNewMultiXact(MultiXactWriter *state, MultiXactOffset offset,
+ MultiXactId multi,
+ int nmembers, MultiXactMember *members)
+{
+ int64 pageno;
+ int64 prev_pageno;
+ int entryno;
+
+ char *buf;
+ MultiXactOffset *offptr;
+
+ pageno = MultiXactIdToOffsetPage(multi);
+ entryno = MultiXactIdToOffsetEntry(multi);
+
+ /*
+ * Store the starting member offset of this multixact in its slot on the
+ * offsets page. Unlike the server function this was copied from, no
+ * MultiXactId is passed along for I/O error reporting:
+ * SlruWriteSwitchPage is called with only the SLRU state and the page
+ * number.
+ */
+ buf = SlruWriteSwitchPage(state->offset, pageno);
+ offptr = (MultiXactOffset *) buf;
+ offptr += entryno;
+
+ *offptr = offset;
+
+ prev_pageno = -1;
+
+ for (int i = 0; i < nmembers; i++, offset++)
+ {
+ TransactionId *memberptr;
+ uint32 *flagsptr;
+ uint32 flagsval;
+ int bshift;
+ int flagsoff;
+ int memberoff;
+
+ Assert(members[i].status <= MultiXactStatusUpdate);
+
+ pageno = MXOffsetToMemberPage(offset);
+ memberoff = MXOffsetToMemberOffset(offset);
+ flagsoff = MXOffsetToFlagsOffset(offset);
+ bshift = MXOffsetToFlagsBitShift(offset);
+
+ if (pageno != prev_pageno)
+ {
+ buf = SlruWriteSwitchPage(state->members, pageno);
+ prev_pageno = pageno;
+ }
+
+ memberptr = (TransactionId *) (buf + memberoff);
+
+ *memberptr = members[i].xid;
+
+ flagsptr = (uint32 *) (buf + flagsoff);
+
+ flagsval = *flagsptr;
+ flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
+ flagsval |= (members[i].status << bshift);
+ *flagsptr = flagsval;
+ }
+}
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 1654e877c07..484536853a1 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -750,6 +750,9 @@ copy_xact_xlog_xid(void)
if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
{
+ MultiXactId new_nxtmulti = old_cluster.controldata.chkpnt_nxtmulti;
+ MultiXactOffset new_nxtmxoff = old_cluster.controldata.chkpnt_nxtmxoff;
+
/*
* If the old server is before the MULTIXACTOFFSET_FORMATCHANGE_CAT_VER
* it must have 32-bit multixid offsets, thus it should be converted.
@@ -757,29 +760,11 @@ copy_xact_xlog_xid(void)
if (old_cluster.controldata.cat_ver < MULTIXACTOFFSET_FORMATCHANGE_CAT_VER &&
new_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER)
{
- MultiXactOffset oldest_offset,
- next_offset;
-
+ remove_new_subdir("pg_multixact/members", false);
remove_new_subdir("pg_multixact/offsets", false);
prep_status("Converting pg_multixact/offsets to 64-bit");
- oldest_offset = convert_multixact_offsets();
- check_ok();
-
- remove_new_subdir("pg_multixact/members", false);
- prep_status("Converting pg_multixact/members");
- convert_multixact_members(oldest_offset);
+ convert_multixacts(&new_nxtmulti, &new_nxtmxoff);
check_ok();
-
- next_offset = old_cluster.controldata.chkpnt_nxtmxoff;
- if (oldest_offset)
- {
- if (next_offset < oldest_offset)
- next_offset += ((MultiXactOffset) 1 << 32) - 1;
-
- next_offset -= oldest_offset - 1;
-
- old_cluster.controldata.chkpnt_nxtmxoff = next_offset;
- }
}
else
{
@@ -796,8 +781,8 @@ copy_xact_xlog_xid(void)
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
"\"%s/pg_resetwal\" -O %llu -m %u,%u \"%s\"",
new_cluster.bindir,
- (unsigned long long) old_cluster.controldata.chkpnt_nxtmxoff,
- old_cluster.controldata.chkpnt_nxtmulti,
+ (unsigned long long) new_nxtmxoff,
+ new_nxtmulti,
old_cluster.controldata.chkpnt_oldstMulti,
new_cluster.pgdata);
check_ok();
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 2c85ec1e949..c13293b4add 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -523,7 +523,6 @@ typedef struct
char path[MAXPGPATH];
} UpgradeTaskReport;
-/* segresize.c */
+/* multixact_rewrite.c */
-MultiXactOffset convert_multixact_offsets(void);
-void convert_multixact_members(MultiXactOffset oldest_offset);
+void convert_multixacts(MultiXactId *new_nxtmulti, MultiXactOffset *new_nxtmxoff);
diff --git a/src/bin/pg_upgrade/segresize.c b/src/bin/pg_upgrade/segresize.c
deleted file mode 100644
index 73064c77deb..00000000000
--- a/src/bin/pg_upgrade/segresize.c
+++ /dev/null
@@ -1,527 +0,0 @@
-/*
- * segresize.c
- *
- * SLRU segment resize utility
- *
- * Copyright (c) 2024, PostgreSQL Global Development Group
- * src/bin/pg_upgrade/segresize.c
- */
-
-#include "postgres_fe.h"
-
-#include "pg_upgrade.h"
-#include "access/multixact.h"
-
-/* See slru.h */
-#define SLRU_PAGES_PER_SEGMENT 32
-
-/*
- * Some kind of iterator associated with a particular SLRU segment. The idea is
- * to specify the segment and page number and then move through the pages.
- */
-typedef struct SlruSegState
-{
- char *dir;
- char *fn;
- FILE *file;
- int64 segno;
- uint64 pageno;
- bool leading_gap;
-} SlruSegState;
-
-/*
- * Mirrors the SlruFileName from slru.c
- */
-static inline char *
-SlruFileName(SlruSegState *state)
-{
- Assert(state->segno >= 0 && state->segno <= INT64CONST(0xFFFFFF));
- return psprintf("%s/%04X", state->dir, (unsigned int) state->segno);
-}
-
-/*
- * Create new SLRU segment file.
- */
-static void
-create_segment(SlruSegState *state)
-{
- Assert(state->fn == NULL);
- Assert(state->file == NULL);
-
- state->fn = SlruFileName(state);
- state->file = fopen(state->fn, "wb");
- if (!state->file)
- pg_fatal("could not create file \"%s\": %m", state->fn);
-}
-
-/*
- * Open existing SLRU segment file.
- */
-static void
-open_segment(SlruSegState *state)
-{
- Assert(state->fn == NULL);
- Assert(state->file == NULL);
-
- state->fn = SlruFileName(state);
- state->file = fopen(state->fn, "rb");
- if (!state->file)
- pg_fatal("could not open file \"%s\": %m", state->fn);
-}
-
-/*
- * Close SLRU segment file.
- */
-static void
-close_segment(SlruSegState *state)
-{
- if (state->file)
- {
- fclose(state->file);
- state->file = NULL;
- }
-
- if (state->fn)
- {
- pfree(state->fn);
- state->fn = NULL;
- }
-}
-
-/*
- * Read next page from the old 32-bit offset segment file.
- */
-static int
-read_old_segment_page(SlruSegState *state, void *buf, bool *empty)
-{
- int len;
-
- /* Open next segment file, if needed. */
- if (!state->fn)
- {
- if (!state->segno)
- state->leading_gap = true;
-
- open_segment(state);
-
- /* Set position to the needed page. */
- if (state->pageno > 0 &&
- fseek(state->file, state->pageno * BLCKSZ, SEEK_SET))
- {
- close_segment(state);
- }
- }
-
- if (state->file)
- {
- /* Segment file do exists, read page from it. */
- state->leading_gap = false;
-
- len = fread(buf, sizeof(char), BLCKSZ, state->file);
-
- /* Are we done or was there an error? */
- if (len <= 0)
- {
- if (ferror(state->file))
- pg_fatal("error reading file \"%s\": %m", state->fn);
-
- if (feof(state->file))
- {
- *empty = true;
- len = -1;
-
- close_segment(state);
- }
- }
- else
- *empty = false;
- }
- else if (!state->leading_gap)
- {
- /* We reached the last segment. */
- len = -1;
- *empty = true;
- }
- else
- {
- /* Skip few first segments if they were frozen and removed. */
- len = BLCKSZ;
- *empty = true;
- }
-
- if (++state->pageno >= SLRU_PAGES_PER_SEGMENT)
- {
- /* Start a new segment. */
- state->segno++;
- state->pageno = 0;
-
- close_segment(state);
- }
-
- return len;
-}
-
-/*
- * Write next page to the new 64-bit offset segment file.
- */
-static void
-write_new_segment_page(SlruSegState *state, void *buf)
-{
- /*
- * Create a new segment file if we still didn't. Creation is
- * postponed until the first non-empty page is found. This helps
- * not to create completely empty segments.
- */
- if (!state->file)
- {
- create_segment(state);
-
- /* Write zeroes to the previously skipped prefix. */
- if (state->pageno > 0)
- {
- char zerobuf[BLCKSZ] = {0};
-
- for (int64 i = 0; i < state->pageno; i++)
- {
- if (fwrite(zerobuf, sizeof(char), BLCKSZ, state->file) != BLCKSZ)
- pg_fatal("could not write file \"%s\": %m", state->fn);
- }
- }
- }
-
- /* Write page to the new segment (if it was created). */
- if (state->file)
- {
- if (fwrite(buf, sizeof(char), BLCKSZ, state->file) != BLCKSZ)
- pg_fatal("could not write file \"%s\": %m", state->fn);
- }
-
- /*
- * Did we reach the maximum page number? Then close segment file
- * and create a new one on the next iteration.
- */
- if (++state->pageno >= SLRU_PAGES_PER_SEGMENT)
- {
- /* Start a new segment. */
- state->segno++;
- state->pageno = 0;
-
- close_segment(state);
- }
-}
-
-typedef uint32 MultiXactOffsetOld;
-
-#define MaxMultiXactOffsetOld ((MultiXactOffsetOld) 0xFFFFFFFF)
-
-#define MULTIXACT_OFFSETS_PER_PAGE_OLD (BLCKSZ / sizeof(MultiXactOffsetOld))
-#define MULTIXACT_OFFSETS_PER_PAGE_NEW (BLCKSZ / sizeof(MultiXactOffset))
-
-/*
- * Convert pg_multixact/offsets segments and return oldest multi offset.
- */
-MultiXactOffset
-convert_multixact_offsets(void)
-{
- SlruSegState oldseg = {0},
- newseg = {0};
- MultiXactOffsetOld oldbuf[MULTIXACT_OFFSETS_PER_PAGE_OLD] = {0};
- MultiXactOffset newbuf[MULTIXACT_OFFSETS_PER_PAGE_NEW] = {0},
- oldest_offset = 0;
- uint64 oldest_multi = old_cluster.controldata.chkpnt_oldstMulti,
- next_multi = old_cluster.controldata.chkpnt_nxtmulti,
- multi,
- old_entry,
- new_entry;
- bool oldest_offset_known = false;
-
- oldseg.dir = psprintf("%s/pg_multixact/offsets", old_cluster.pgdata);
- newseg.dir = psprintf("%s/pg_multixact/offsets", new_cluster.pgdata);
-
- old_entry = oldest_multi % MULTIXACT_OFFSETS_PER_PAGE_OLD;
- oldseg.pageno = oldest_multi / MULTIXACT_OFFSETS_PER_PAGE_OLD;
- oldseg.segno = oldseg.pageno / SLRU_PAGES_PER_SEGMENT;
- oldseg.pageno %= SLRU_PAGES_PER_SEGMENT;
-
- new_entry = oldest_multi % MULTIXACT_OFFSETS_PER_PAGE_NEW;
- newseg.pageno = oldest_multi / MULTIXACT_OFFSETS_PER_PAGE_NEW;
- newseg.segno = newseg.pageno / SLRU_PAGES_PER_SEGMENT;
- newseg.pageno %= SLRU_PAGES_PER_SEGMENT;
-
- if (next_multi < oldest_multi)
- next_multi += (uint64) 1 << 32; /* wraparound */
-
- /* Copy multi offsets reading only needed segment pages */
- for (multi = oldest_multi; multi < next_multi; old_entry = 0)
- {
- int oldlen;
- bool empty;
-
- /* Handle possible segment wraparound */
-#define OLD_OFFSET_SEGNO_MAX \
- (MaxMultiXactId / MULTIXACT_OFFSETS_PER_PAGE_OLD / SLRU_PAGES_PER_SEGMENT)
- if (oldseg.segno > OLD_OFFSET_SEGNO_MAX)
- {
- oldseg.segno = 0;
- oldseg.pageno = 0;
- }
-
- oldlen = read_old_segment_page(&oldseg, oldbuf, &empty);
- if (empty || oldlen != BLCKSZ)
- pg_fatal("cannot read page %llu from file \"%s\": %m",
- (unsigned long long) oldseg.pageno, oldseg.fn);
-
- /* Save oldest multi offset */
- if (!oldest_offset_known)
- {
- oldest_offset = oldbuf[old_entry];
- oldest_offset_known = true;
- }
-
- /* Skip wrapped-around invalid MultiXactIds */
- if (multi == (uint64) 1 << 32)
- {
- Assert(oldseg.segno == 0);
- Assert(oldseg.pageno == 1);
- Assert(old_entry == 0);
- Assert(new_entry == 0);
-
- multi += FirstMultiXactId;
- old_entry = FirstMultiXactId;
- new_entry = FirstMultiXactId;
- }
-
- /* Copy entries to the new page */
- for (; multi < next_multi && old_entry < MULTIXACT_OFFSETS_PER_PAGE_OLD;
- multi++, old_entry++)
- {
- MultiXactOffset offset = oldbuf[old_entry];
-
- /* Handle possible offset wraparound (1 becomes 2^32) */
- if (offset < oldest_offset)
- offset += ((uint64) 1 << 32) - 1;
-
- /* Subtract oldest_offset, so new offsets will start from 1 */
- newbuf[new_entry++] = offset - oldest_offset + 1;
-
- if (new_entry >= MULTIXACT_OFFSETS_PER_PAGE_NEW)
- {
- /* Handle possible segment wraparound */
-#define NEW_OFFSET_SEGNO_MAX \
- (MaxMultiXactId / MULTIXACT_OFFSETS_PER_PAGE_NEW / SLRU_PAGES_PER_SEGMENT)
- if (newseg.segno > NEW_OFFSET_SEGNO_MAX)
- {
- newseg.segno = 0;
- newseg.pageno = 0;
- }
-
- /* Write new page */
- write_new_segment_page(&newseg, newbuf);
- new_entry = 0;
- }
- }
- }
-
- /* Write the last incomplete page */
- if (new_entry > 0 || oldest_multi == next_multi)
- {
- memset(&newbuf[new_entry], 0,
- sizeof(newbuf[0]) * (MULTIXACT_OFFSETS_PER_PAGE_NEW - new_entry));
- write_new_segment_page(&newseg, newbuf);
- }
-
- /* Use next_offset as oldest_offset, if oldest_multi == next_multi */
- if (!oldest_offset_known)
- {
- Assert(oldest_multi == next_multi);
- oldest_offset = (MultiXactOffset) old_cluster.controldata.chkpnt_nxtmxoff;
- }
-
- /* Release resources */
- close_segment(&oldseg);
- close_segment(&newseg);
-
- pfree(oldseg.dir);
- pfree(newseg.dir);
-
- return oldest_offset;
-}
-
-#define MXACT_MEMBERS_FLAG_BYTES 1
-
-#define MULTIXACT_MEMBERS_PER_GROUP 4
-#define MULTIXACT_MEMBERGROUP_SIZE \
- (MULTIXACT_MEMBERS_PER_GROUP * (sizeof(TransactionId) + MXACT_MEMBERS_FLAG_BYTES))
-#define MULTIXACT_MEMBERGROUPS_PER_PAGE \
- (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
-
-#define MULTIXACT_MEMBERS_PER_PAGE \
- (MULTIXACT_MEMBERS_PER_GROUP * MULTIXACT_MEMBERGROUPS_PER_PAGE)
-#define MULTIXACT_MEMBER_FLAG_BYTES_PER_GROUP \
- (MXACT_MEMBERS_FLAG_BYTES * MULTIXACT_MEMBERS_PER_GROUP)
-
-typedef struct MultiXactMembersCtx
-{
- SlruSegState seg;
- char buf[BLCKSZ];
- int group;
- int member;
- char *flag;
- TransactionId *xid;
-} MultiXactMembersCtx;
-
-static void
-MultiXactMembersCtxInit(MultiXactMembersCtx *ctx)
-{
- ctx->seg.dir = psprintf("%s/pg_multixact/members", new_cluster.pgdata);
-
- ctx->group = 0;
- ctx->member = 1; /* skip invalid zero offset */
-
- ctx->flag = (char *) ctx->buf + ctx->group * MULTIXACT_MEMBERGROUP_SIZE;
- ctx->xid = (TransactionId *)(ctx->flag + MXACT_MEMBERS_FLAG_BYTES * MULTIXACT_MEMBERS_PER_GROUP);
-
- ctx->flag += ctx->member;
- ctx->xid += ctx->member;
-}
-
-static void
-MultiXactMembersCtxAdd(MultiXactMembersCtx *ctx, char flag, TransactionId xid)
-{
- /* Copy member's xid and flags to the new page */
- *ctx->flag++ = flag;
- *ctx->xid++ = xid;
-
- if (++ctx->member < MULTIXACT_MEMBERS_PER_GROUP)
- return;
-
- /* Start next member group */
- ctx->member = 0;
-
- if (++ctx->group >= MULTIXACT_MEMBERGROUPS_PER_PAGE)
- {
- /* Write current page and start new */
- write_new_segment_page(&ctx->seg, ctx->buf);
-
- ctx->group = 0;
- memset(ctx->buf, 0, BLCKSZ);
- }
-
- ctx->flag = (char *) ctx->buf + ctx->group * MULTIXACT_MEMBERGROUP_SIZE;
- ctx->xid = (TransactionId *)(ctx->flag + MXACT_MEMBERS_FLAG_BYTES * MULTIXACT_MEMBERS_PER_GROUP);
-}
-
-static void
-MultiXactMembersCtxFinit(MultiXactMembersCtx *ctx)
-{
- if (ctx->flag > (char *) ctx->buf)
- write_new_segment_page(&ctx->seg, ctx->buf);
-
- close_segment(&ctx->seg);
-
- pfree(ctx->seg.dir);
-}
-
-/*
- * Convert pg_multixact/members segments, offsets will start from 1.
- *
- */
-void
-convert_multixact_members(MultiXactOffset oldest_offset)
-{
- MultiXactOffset next_offset,
- offset;
- SlruSegState oldseg = {0};
- char oldbuf[BLCKSZ] = {0};
- int oldidx;
- MultiXactMembersCtx newctx = {0};
-
- oldseg.dir = psprintf("%s/pg_multixact/members", old_cluster.pgdata);
-
- next_offset = (MultiXactOffset) old_cluster.controldata.chkpnt_nxtmxoff;
- if (next_offset < oldest_offset)
- next_offset += ((uint64) 1 << 32) - 1;
-
- /* Initialize the old starting position */
- oldseg.pageno = oldest_offset / MULTIXACT_MEMBERS_PER_PAGE;
- oldseg.segno = oldseg.pageno / SLRU_PAGES_PER_SEGMENT;
- oldseg.pageno %= SLRU_PAGES_PER_SEGMENT;
-
- /* Initialize new starting position */
- MultiXactMembersCtxInit(&newctx);
-
- /* Iterate through the original directory */
- oldidx = oldest_offset % MULTIXACT_MEMBERS_PER_PAGE;
- for (offset = oldest_offset; offset < next_offset;)
- {
- bool empty;
- int oldlen;
- int ngroups;
- int oldgroup;
- int oldmember;
-
- oldlen = read_old_segment_page(&oldseg, oldbuf, &empty);
- if (empty || oldlen != BLCKSZ)
- pg_fatal("cannot read page %llu from file \"%s\": %m",
- (unsigned long long) oldseg.pageno, oldseg.fn);
-
- /* Iterate through the old member groups */
- ngroups = oldlen / MULTIXACT_MEMBERGROUP_SIZE;
- oldmember = oldidx % MULTIXACT_MEMBERS_PER_GROUP;
- oldgroup = oldidx / MULTIXACT_MEMBERS_PER_GROUP;
- while (oldgroup < ngroups && offset < next_offset)
- {
- char *oldflag;
- TransactionId *oldxid;
- int i;
-
- oldflag = (char *) oldbuf + oldgroup * MULTIXACT_MEMBERGROUP_SIZE;
- oldxid = (TransactionId *)(oldflag + MULTIXACT_MEMBER_FLAG_BYTES_PER_GROUP);
-
- oldxid += oldmember;
- oldflag += oldmember;
-
- /* Iterate through the old members */
- for (i = oldmember;
- i < MULTIXACT_MEMBERS_PER_GROUP && offset < next_offset;
- i++)
- {
- MultiXactMembersCtxAdd(&newctx, *oldflag++, *oldxid++);
-
- if (++offset == (uint64) 1 << 32)
- {
- Assert(i == MaxMultiXactOffsetOld % MULTIXACT_MEMBERS_PER_GROUP);
- goto wraparound;
- }
- }
-
- oldgroup++;
- oldmember = 0;
- }
-
- oldidx = 0;
-
- continue;
-
-wraparound:
-#define SEGNO_MAX MaxMultiXactOffsetOld / MULTIXACT_MEMBERS_PER_PAGE / SLRU_PAGES_PER_SEGMENT
-#define PAGENO_MAX MaxMultiXactOffsetOld / MULTIXACT_MEMBERS_PER_PAGE % SLRU_PAGES_PER_SEGMENT
- Assert((oldseg.segno == SEGNO_MAX && oldseg.pageno == PAGENO_MAX + 1) ||
- (oldseg.segno == SEGNO_MAX + 1 && oldseg.pageno == 0));
-
- /* Switch to segment 0000 */
- close_segment(&oldseg);
- oldseg.segno = 0;
- oldseg.pageno = 0;
-
- /* skip invalid zero multi offset */
- oldidx = 1;
- }
-
- MultiXactMembersCtxFinit(&newctx);
-
- /* Release resources */
- close_segment(&oldseg);
-
- pfree(oldseg.dir);
-}
diff --git a/src/bin/pg_upgrade/slru_io.c b/src/bin/pg_upgrade/slru_io.c
new file mode 100644
index 00000000000..152ecfdce59
--- /dev/null
+++ b/src/bin/pg_upgrade/slru_io.c
@@ -0,0 +1,214 @@
+/*
+ * slru_io.c
+ *
+ * Routines for reading and writing SLRU files during upgrade.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/slru_io.c
+ */
+
+#include "postgres_fe.h"
+
+#include <fcntl.h>
+
+#include "pg_upgrade.h"
+#include "slru_io.h"
+
+#include "common/file_perm.h"
+#include "common/file_utils.h"
+#include "port/pg_iovec.h"
+
+/* SLRU_PAGES_PER_SEGMENT is defined in slru_io.h, included above */
+
+/*
+ * State for reading or writing an SLRU, with a one page buffer.
+ *
+ * A handle is created by OpenSlruRead() or OpenSlruWrite().  It tracks at
+ * most one open segment file and the contents of a single page of it.
+ */
+typedef struct SlruSegState
+{
+ /* true if created by OpenSlruWrite(), false if created by OpenSlruRead() */
+ bool writing;
+
+ /* directory holding the segment files (private copy of the caller's string) */
+ char *dir;
+ /* path of the currently open segment file, or NULL if none is open */
+ char *fn;
+ /* descriptor of the currently open segment file, or -1 if none is open */
+ int fd;
+ /* number of the currently open segment, or -1 if none was opened yet */
+ int64 segno;
+ /* page number currently held in buf; only meaningful when segno != -1 */
+ uint64 pageno;
+
+ /* contents of page 'pageno' */
+ PGAlignedBlock buf;
+} SlruSegState;
+
+static void SlruFlush(SlruSegState *state);
+
+
+/*
+ * Create a handle for reading pages of the SLRU stored in directory 'dir'.
+ *
+ * No file is opened here; the first SlruReadSwitchPage() call opens the
+ * segment it needs.  The handle keeps its own copy of 'dir'.  Release the
+ * handle with CloseSlruRead().
+ */
+SlruSegState *
+OpenSlruRead(char *dir)
+{
+	SlruSegState *reader = pg_malloc(sizeof(SlruSegState));
+
+	/* Nothing is open yet: no descriptor, no file name, no segment. */
+	reader->fd = -1;
+	reader->fn = NULL;
+	reader->segno = -1;
+	reader->pageno = 0;
+
+	reader->dir = pstrdup(dir);
+	reader->writing = false;
+
+	return reader;
+}
+
+/*
+ * Release a handle created by OpenSlruRead().
+ *
+ * Closes the segment file if one is open, then frees the handle together
+ * with the strings it owns.  'state' must not be used afterwards.
+ */
+void
+CloseSlruRead(SlruSegState *state)
+{
+	Assert(!state->writing);
+
+	/* fd is still -1 if no page was ever read; don't close() that */
+	if (state->fd >= 0)
+		close(state->fd);
+
+	/* fn (may be NULL) and dir belong to the handle; free them with it */
+	pg_free(state->fn);
+	pg_free(state->dir);
+	pg_free(state);
+}
+
+/*
+ * Create a handle for writing pages of the SLRU stored in directory 'dir'.
+ *
+ * No file is created here; segment files are created by
+ * SlruWriteSwitchPage() as pages are selected.  The handle keeps its own
+ * copy of 'dir'.  Release the handle with CloseSlruWrite(), which also
+ * writes out the last buffered page.
+ *
+ * NOTE: 'startPageno' is accepted but not used by this function.
+ */
+SlruSegState *
+OpenSlruWrite(char *dir, int64 startPageno)
+{
+	SlruSegState *writer = pg_malloc(sizeof(SlruSegState));
+
+	/* Nothing is open yet: no descriptor, no file name, no segment. */
+	writer->fd = -1;
+	writer->fn = NULL;
+	writer->segno = -1;
+	writer->pageno = 0;
+
+	writer->dir = pstrdup(dir);
+	writer->writing = true;
+
+	return writer;
+}
+
+/*
+ * Release a handle created by OpenSlruWrite().
+ *
+ * Writes out the page still held in the buffer (if any page was selected),
+ * closes the segment file if one is open, then frees the handle together
+ * with the strings it owns.  'state' must not be used afterwards.
+ */
+void
+CloseSlruWrite(SlruSegState *state)
+{
+	Assert(state->writing);
+
+	/* No-op if no page was ever selected */
+	SlruFlush(state);
+
+	/* fd is still -1 if no segment was ever created; don't close() that */
+	if (state->fd >= 0)
+		close(state->fd);
+
+	/* fn (may be NULL) and dir belong to the handle; free them with it */
+	pg_free(state->fn);
+	pg_free(state->dir);
+	pg_free(state);
+}
+
+/*
+ * Write the buffered page to its slot in the currently open segment file.
+ *
+ * Does nothing if no page has been selected yet (segno is still -1).  A
+ * write failure is fatal.
+ */
+static void
+SlruFlush(SlruSegState *state)
+{
+	struct iovec iov;
+	off_t		pos;
+
+	/* No segment has been opened, so there is nothing buffered. */
+	if (state->segno == -1)
+		return;
+
+	iov.iov_base = &state->buf;
+	iov.iov_len = BLCKSZ;
+
+	/* byte position of the page within its segment file */
+	pos = (state->pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+	if (pg_pwritev_with_retry(state->fd, &iov, 1, pos) < 0)
+		pg_fatal("could not write file \"%s\": %m", state->fn);
+}
+
+/*
+ * Select the page that subsequent writes go to and return its buffer.
+ *
+ * If 'pageno' is already the current page, its buffer is returned unchanged.
+ * Otherwise the current page (if any) is written out first and the returned
+ * buffer starts out zero-filled; it reaches disk on the next page switch or
+ * in CloseSlruWrite().
+ *
+ * NOTE: Stepping to a new segment creates its file with O_EXCL, so each
+ * segment must be written in full before moving on to the next one; coming
+ * back to an earlier segment fails.  That restriction would be easy to lift
+ * if needed, but it matches how the current callers use this.
+ */
+char *
+SlruWriteSwitchPage(SlruSegState *state, uint64 pageno)
+{
+	int64		target_segno;
+	off_t		page_pos;
+
+	/* Already positioned on the requested page? */
+	if (state->segno != -1 && state->pageno == pageno)
+		return state->buf.data;
+
+	/* Write out the page being left, then start the new one zero-filled. */
+	SlruFlush(state);
+	memset(state->buf.data, 0, BLCKSZ);
+
+	target_segno = pageno / SLRU_PAGES_PER_SEGMENT;
+	page_pos = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+	if (target_segno != state->segno)
+	{
+		/* Done with the previous segment, if there was one. */
+		if (state->segno != -1)
+		{
+			close(state->fd);
+			state->fd = -1;
+			pg_free(state->fn);
+			state->fn = NULL;
+		}
+
+		/* Create the segment */
+		state->fn = psprintf("%s/%04X", state->dir, (unsigned int) target_segno);
+		state->fd = open(state->fn, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+						 pg_file_create_mode);
+		if (state->fd < 0)
+			pg_fatal("could not create file \"%s\": %m", state->fn);
+		state->segno = target_segno;
+
+		/* Zero-fill the part of the segment preceding the selected page. */
+		if (page_pos > 0 && pg_pwrite_zeros(state->fd, page_pos, 0) < 0)
+			pg_fatal("could not write file \"%s\": %m", state->fn);
+	}
+
+	state->pageno = pageno;
+	return state->buf.data;
+}
+
+/*
+ * Read the given page and return a pointer to its contents.
+ *
+ * Reading can be done in random order.  If 'pageno' is the page already held
+ * in the buffer, no I/O is done.  The returned pointer refers to the handle's
+ * one-page buffer, so it is only valid until the next call on this handle.
+ *
+ * Failure to open the segment, a read error, or a read that returns fewer
+ * than BLCKSZ bytes (e.g. a truncated segment) is fatal.  The short-read
+ * check matters because the buffer is reused: without it the tail of the
+ * previously read page would be returned as if it belonged to this one.
+ */
+char *
+SlruReadSwitchPage(SlruSegState *state, uint64 pageno)
+{
+	int64		segno;
+	off_t		offset;
+	ssize_t		nread;
+	struct iovec iovec = {
+		.iov_base = &state->buf,
+		.iov_len = BLCKSZ,
+	};
+
+	if (state->segno != -1 && pageno == state->pageno)
+		return state->buf.data;
+
+	segno = pageno / SLRU_PAGES_PER_SEGMENT;
+
+	if (segno != state->segno)
+	{
+		if (state->segno != -1)
+		{
+			close(state->fd);
+			state->fd = -1;
+			pg_free(state->fn);
+			state->fn = NULL;
+		}
+
+		/* Open new segment */
+		state->fn = psprintf("%s/%04X", state->dir, (unsigned int) segno);
+		if ((state->fd = open(state->fn, O_RDONLY | PG_BINARY, 0)) < 0)
+		{
+			pg_fatal("could not open file \"%s\": %m", state->fn);
+		}
+		state->segno = segno;
+	}
+
+	/* byte position of the page within its segment file */
+	offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+	nread = pg_preadv(state->fd, &iovec, 1, offset);
+	if (nread < 0)
+		pg_fatal("could not read file \"%s\": %m", state->fn);
+
+	/* errno is not meaningful for a short read, so don't use %m here */
+	if (nread != BLCKSZ)
+		pg_fatal("could not read file \"%s\": read %d of %d",
+				 state->fn, (int) nread, BLCKSZ);
+
+	state->pageno = pageno;
+
+	return state->buf.data;
+}
diff --git a/src/bin/pg_upgrade/slru_io.h b/src/bin/pg_upgrade/slru_io.h
new file mode 100644
index 00000000000..e1a9c063139
--- /dev/null
+++ b/src/bin/pg_upgrade/slru_io.h
@@ -0,0 +1,23 @@
+/*
+ * slru_io.h
+ *
+ * Routines for reading and writing SLRU files during upgrade.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/slru_io.h
+ */
+
+#ifndef SLRU_IO_H
+#define SLRU_IO_H
+
+/* XXX: copied from slru.h */
+#define SLRU_PAGES_PER_SEGMENT 32
+
+/*
+ * Opaque handle for page-at-a-time access to the segment files of one SLRU
+ * directory.  A handle is opened either for reading or for writing; pages
+ * are addressed by SLRU page number, and the handle maps that to the right
+ * segment file and position within it.
+ */
+typedef struct SlruSegState SlruSegState;
+
+/*
+ * Reading.  SlruReadSwitchPage() returns a pointer to the contents of the
+ * requested page; pages may be requested in any order.
+ */
+extern SlruSegState *OpenSlruRead(char *dir);
+extern void CloseSlruRead(SlruSegState *state);
+extern char *SlruReadSwitchPage(SlruSegState *state, uint64 pageno);
+
+/*
+ * Writing.  SlruWriteSwitchPage() returns a buffer for the requested page,
+ * to be filled in by the caller; it is written out when switching to another
+ * page or on CloseSlruWrite().
+ */
+extern SlruSegState *OpenSlruWrite(char *dir, int64 startPageno);
+extern void CloseSlruWrite(SlruSegState *state);
+extern char *SlruWriteSwitchPage(SlruSegState *state, uint64 pageno);
+
+#endif							/* SLRU_IO_H */
--
2.39.5