v3-0001-Optimize-SnapBuild-by-maintaining-committed.xip-i.patch
application/octet-stream
Filename: v3-0001-Optimize-SnapBuild-by-maintaining-committed.xip-i.patch
Type: application/octet-stream
Part: 0
From a7c8d1da5d0496b2a7d05d52e37511a37075e13d Mon Sep 17 00:00:00 2001
From: alterego655 <824662526@qq.com>
Date: Tue, 16 Dec 2025 15:15:14 +0800
Subject: [PATCH v3 1/2] Optimize SnapBuild by maintaining committed.xip in
sorted order
Maintain committed.xip in xidComparator order using a two-phase approach
that balances performance during slot initialization with efficiency during
steady-state decoding:
Pre-CONSISTENT phase:
- Use fast O(1) append for unsorted XIDs
- Sort once at transition to CONSISTENT state
- Avoids merge overhead during initial snapshot building
Post-CONSISTENT phase:
- Use per-commit batch insertion with sorted merge
- Collect all relevant XIDs for each commit
- Sort the batch once: O(m log m)
- Reverse-merge into the global array: O(N + m)
With this change, snapshot builds can skip the qsort() step and simply
memcpy the sorted array. While per-commit work increases from O(m) (plain
append) to O(m log m + N) (sort-and-merge) after CONSISTENT, eliminating
repeated O(N log N) sorts at each snapshot build significantly reduces
overall steady-state cost once CONSISTENT is reached and snapshots are
built frequently.
---
src/backend/replication/logical/snapbuild.c | 164 +++++++++++++++++--
src/include/replication/snapbuild_internal.h | 12 +-
2 files changed, 151 insertions(+), 25 deletions(-)
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index d6ab1e017eb..e0d7d3e0ae8 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -407,8 +407,25 @@ SnapBuildBuildSnapshot(SnapBuild *builder)
builder->committed.xip,
builder->committed.xcnt * sizeof(TransactionId));
- /* sort so we can bsearch() */
- qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator);
+ if (builder->state < SNAPBUILD_CONSISTENT)
+ {
+ /*
+ * Pre-CONSISTENT: committed.xip is unsorted, sort the snapshot copy.
+ */
+ if (snapshot->xcnt > 1)
+ qsort(snapshot->xip, snapshot->xcnt,
+ sizeof(TransactionId), xidComparator);
+ }
+#ifdef USE_ASSERT_CHECKING
+ else
+ {
+ /*
+ * Post-CONSISTENT: committed.xip should already be sorted, verify.
+ */
+ for (int i = 1; i < snapshot->xcnt; i++)
+ Assert(snapshot->xip[i - 1] < snapshot->xip[i]);
+ }
+#endif
/*
* Initially, subxip is empty, i.e. it's a snapshot to be used by
@@ -822,17 +839,33 @@ SnapBuildDistributeSnapshotAndInval(SnapBuild *builder, XLogRecPtr lsn, Transact
}
/*
- * Keep track of a new catalog changing transaction that has committed.
+ * Keep track of new catalog changing transactions that have committed.
+ *
+ * Before reaching CONSISTENT state, we use fast O(1) append since the array
+ * doesn't need to be sorted yet. After CONSISTENT, we maintain sorted order
+ * using a merge approach to avoid repeated full-array sorts at snapshot build.
*/
static void
-SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
+SnapBuildAddCommittedTxns(SnapBuild *builder,
+ const TransactionId *batch_xids,
+ int batch_cnt)
{
- Assert(TransactionIdIsValid(xid));
+ TransactionId *committed_xids;
+ size_t old_xcnt;
+
+ old_xcnt = builder->committed.xcnt;
- if (builder->committed.xcnt == builder->committed.xcnt_space)
+ /* Ensure we have space for all elements */
+ if (old_xcnt + batch_cnt > builder->committed.xcnt_space)
{
builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
+ /*
+ * Repeat if we need more than 2x current space.
+ */
+ while (old_xcnt + batch_cnt > builder->committed.xcnt_space)
+ builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
+
elog(DEBUG1, "increasing space for committed transactions to %u",
(uint32) builder->committed.xcnt_space);
@@ -840,12 +873,57 @@ SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
builder->committed.xcnt_space * sizeof(TransactionId));
}
+ committed_xids = builder->committed.xip;
+
/*
- * TODO: It might make sense to keep the array sorted here instead of
- * doing it every time we build a new snapshot. On the other hand this
- * gets called repeatedly when a transaction with subtransactions commits.
+ * Before CONSISTENT state, just append unsorted for O(1) performance. The
+ * array will be sorted once when transitioning to CONSISTENT.
*/
- builder->committed.xip[builder->committed.xcnt++] = xid;
+ if (builder->state < SNAPBUILD_CONSISTENT)
+ {
+ for (int i = 0; i < batch_cnt; i++)
+ {
+ Assert(TransactionIdIsValid(batch_xids[i]));
+ committed_xids[old_xcnt++] = batch_xids[i];
+ }
+ builder->committed.xcnt = old_xcnt;
+ }
+ else
+ {
+ /*
+ * After CONSISTENT, maintain sorted order via merge. Merge from the
+ * end to avoid overwriting unread data.
+ */
+ int old_idx = old_xcnt - 1;
+ int batch_idx = batch_cnt - 1;
+ int write_idx = old_xcnt + batch_cnt - 1;
+
+ while (old_idx >= 0 && batch_idx >= 0)
+ {
+ Assert(TransactionIdIsValid(batch_xids[batch_idx]));
+
+ if (committed_xids[old_idx] > batch_xids[batch_idx])
+ {
+ committed_xids[write_idx--] = committed_xids[old_idx--];
+ }
+ else
+ {
+ /* Duplicates should never occur */
+ Assert(committed_xids[old_idx] != batch_xids[batch_idx]);
+ committed_xids[write_idx--] = batch_xids[batch_idx--];
+ }
+ }
+
+ /* Copy any remaining batch elements */
+ while (batch_idx >= 0)
+ {
+ Assert(TransactionIdIsValid(batch_xids[batch_idx]));
+ committed_xids[write_idx--] = batch_xids[batch_idx--];
+ }
+
+ /* Old elements that weren't moved are already in correct position */
+ builder->committed.xcnt = old_xcnt + batch_cnt;
+ }
}
/*
@@ -947,6 +1025,13 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
TransactionId xmax = xid;
+ /*
+ * Collect XIDs that need tracking into a batch. We'll sort and merge
+ * them into committed.xip in one pass at the end.
+ */
+ TransactionId *batch_xids = NULL;
+ int batch_cnt = 0;
+
/*
* Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor
* will they be part of a snapshot. So we don't need to record anything.
@@ -977,6 +1062,12 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
}
}
+ if (nsubxacts > 0 || builder->building_full_snapshot ||
+ SnapBuildXidHasCatalogChanges(builder, xid, xinfo))
+ {
+ batch_xids = palloc((nsubxacts + 1) * sizeof(TransactionId));
+ }
+
for (nxact = 0; nxact < nsubxacts; nxact++)
{
TransactionId subxid = subxacts[nxact];
@@ -993,7 +1084,7 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
elog(DEBUG1, "found subtransaction %u:%u with catalog changes",
xid, subxid);
- SnapBuildAddCommittedTxn(builder, subxid);
+ batch_xids[batch_cnt++] = subxid;
if (NormalTransactionIdFollows(subxid, xmax))
xmax = subxid;
@@ -1007,7 +1098,7 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
*/
else if (needs_timetravel)
{
- SnapBuildAddCommittedTxn(builder, subxid);
+ batch_xids[batch_cnt++] = subxid;
if (NormalTransactionIdFollows(subxid, xmax))
xmax = subxid;
}
@@ -1020,7 +1111,7 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
xid);
needs_snapshot = true;
needs_timetravel = true;
- SnapBuildAddCommittedTxn(builder, xid);
+ batch_xids[batch_cnt++] = xid;
}
else if (sub_needs_timetravel)
{
@@ -1028,15 +1119,33 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
elog(DEBUG2, "forced transaction %u to do timetravel due to one of its subtransactions",
xid);
needs_timetravel = true;
- SnapBuildAddCommittedTxn(builder, xid);
+ batch_xids[batch_cnt++] = xid;
}
else if (needs_timetravel)
{
elog(DEBUG2, "forced transaction %u to do timetravel", xid);
- SnapBuildAddCommittedTxn(builder, xid);
+ batch_xids[batch_cnt++] = xid;
+ }
+
+ /*
+ * Sort and merge the batch into committed.xip. This maintains the
+ * invariant that committed.xip is globally sorted by raw uint32 order
+ * (xidComparator).
+ */
+ if (batch_cnt > 0)
+ {
+ /* Sort by raw uint32 order */
+ qsort(batch_xids, batch_cnt, sizeof(TransactionId), xidComparator);
+
+ /* Merge into global array */
+ SnapBuildAddCommittedTxns(builder, batch_xids, batch_cnt);
}
+ /* Free batch array if allocated (even if nothing was added to it) */
+ if (batch_xids != NULL)
+ pfree(batch_xids);
+
if (!needs_timetravel)
{
/* record that we cannot export a general snapshot anymore */
@@ -1305,6 +1414,15 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn
Assert(TransactionIdIsNormal(builder->xmin));
Assert(TransactionIdIsNormal(builder->xmax));
+ /*
+ * Sort committed.xip before transitioning to CONSISTENT. During the
+ * pre-CONSISTENT phase, XIDs were appended unsorted for performance.
+ * Now we need sorted order for efficient snapshot building.
+ */
+ if (builder->committed.xcnt > 1)
+ qsort(builder->committed.xip, builder->committed.xcnt,
+ sizeof(TransactionId), xidComparator);
+
builder->state = SNAPBUILD_CONSISTENT;
builder->next_phase_at = InvalidTransactionId;
@@ -1402,6 +1520,15 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn
TransactionIdPrecedesOrEquals(builder->next_phase_at,
running->oldestRunningXid))
{
+ /*
+ * Sort committed.xip before transitioning to CONSISTENT. During the
+ * pre-CONSISTENT phase, XIDs were appended unsorted for performance.
+ * Now we need sorted order for efficient snapshot building.
+ */
+ if (builder->committed.xcnt > 1)
+ qsort(builder->committed.xip, builder->committed.xcnt,
+ sizeof(TransactionId), xidComparator);
+
builder->state = SNAPBUILD_CONSISTENT;
builder->next_phase_at = InvalidTransactionId;
@@ -1889,6 +2016,13 @@ SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
pfree(builder->committed.xip);
builder->committed.xcnt_space = ondisk.builder.committed.xcnt;
builder->committed.xip = ondisk.builder.committed.xip;
+
+ /*
+ * Sort the restored array to ensure it's in xidComparator order. Old
+ * snapshot files may have been written with unsorted arrays.
+ */
+ qsort(builder->committed.xip, builder->committed.xcnt,
+ sizeof(TransactionId), xidComparator);
}
ondisk.builder.committed.xip = NULL;
diff --git a/src/include/replication/snapbuild_internal.h b/src/include/replication/snapbuild_internal.h
index 3b915dc8793..164160e5e5e 100644
--- a/src/include/replication/snapbuild_internal.h
+++ b/src/include/replication/snapbuild_internal.h
@@ -115,16 +115,8 @@ struct SnapBuild
/*
* Array of committed transactions that have modified the catalog.
*
- * As this array is frequently modified we do *not* keep it in
- * xidComparator order. Instead we sort the array when building &
- * distributing a snapshot.
- *
- * TODO: It's unclear whether that reasoning has much merit. Every
- * time we add something here after becoming consistent will also
- * require distributing a snapshot. Storing them sorted would
- * potentially also make it easier to purge (but more complicated wrt
- * wraparound?). Should be improved if sorting while building the
- * snapshot shows up in profiles.
+ * Maintained in sorted order (by raw uint32 value) to allow efficient
+ * snapshot building without repeated sorting overhead.
*/
TransactionId *xip;
} committed;
--
2.51.0