v1-0002-DMB-barries-fix-memory-ordering-on-MSVC-ARM64.patch
application/octet-stream
Filename: v1-0002-DMB-barries-fix-memory-ordering-on-MSVC-ARM64.patch
Type: application/octet-stream
Part: 1
From 6f8245bbc9f737eb4f5be48df9e76702d2f511b5 Mon Sep 17 00:00:00 2001
From: Greg Burd <greg@burd.me>
Date: Thu, 20 Nov 2025 12:00:47 -0500
Subject: [PATCH v1 2/2] DMB barries fix memory ordering on MSVC/ARM64
MSVC ARM64 builds have been experiencing intermittent test failures
(particularly recovery/027_stream_regress) due to insufficient memory
barrier semantics in both atomic operations and spinlock
implementations. Unlike GCC, which generates explicit memory barriers
automatically, MSVC's _Interlocked* intrinsics on ARM64 lack full
sequential consistency guarantees.
This commit adds explicit Data Memory Barrier (DMB sy) instructions on
ARM64:
In src/include/port/atomics/generic-msvc.h:
* Add conditional DMB barriers before and after all atomic operations
on ARM64/MSVC (CAS/exchange/fetch-add)
* Barriers expand to no-ops on non-ARM64/MSVC platforms
* Applies to all six atomic operations:
compare-exchange, exchange, and fetch-add for both 32-bit and 64-bit
values
In src/include/storage/s_lock.h:
Add ARM64-specific TAS() implementation with explicit barriers around
the InterlockedCompareExchange intrinsic Replace S_UNLOCK() to use
hardware DMB instead of just _ReadWriteBarrier() (compiler barrier is
insufficient on ARM64) Non-ARM64 MSVC code path remains unchanged
These changes ensure sequential consistency semantics matching what GCC
generates automatically via _atomic* builtins, resolving the WAL
replication synchronization issues that caused 027_stream_regress test
timeouts.
Performance impact is negligible: DMB instructions are already implicit
in x86/x64 memory model, and ARM32 already provided full barriers via
intrinsics.
---
src/include/port/atomics/generic-msvc.h | 155 ++++++++++++++++++++++--
src/include/storage/s_lock.h | 80 +++++++++---
2 files changed, 207 insertions(+), 28 deletions(-)
diff --git a/src/include/port/atomics/generic-msvc.h b/src/include/port/atomics/generic-msvc.h
index a6ea5f1c2e7..c16ccafa341 100644
--- a/src/include/port/atomics/generic-msvc.h
+++ b/src/include/port/atomics/generic-msvc.h
@@ -12,13 +12,19 @@
* * Interlocked Variable Access
* http://msdn.microsoft.com/en-us/library/ms684122%28VS.85%29.aspx
*
+ * On ARM64, the _Interlocked* intrinsics do not emit full memory barriers
+ * by default. This file adds explicit DMB (Data Memory Barrier) instructions
+ * for ARM64 to ensure sequential consistency semantics required by PostgreSQL,
+ * matching the behavior of GCC's __atomic_* builtins which generate appropriate
+ * barriers automatically via __aarch64_cas4_acq_rel or equivalent.
+ *
* src/include/port/atomics/generic-msvc.h
*
*-------------------------------------------------------------------------
*/
#include <intrin.h>
-/* intentionally no include guards, should only be included by atomics.h */
+ /* intentionally no include guards, should only be included by atomics.h */
#ifndef INSIDE_ATOMICS_H
#error "should be included via atomics.h"
#endif
@@ -26,8 +32,23 @@
#pragma intrinsic(_ReadWriteBarrier)
#define pg_compiler_barrier_impl() _ReadWriteBarrier()
+
#ifndef pg_memory_barrier_impl
+#if defined(_M_ARM64)
+/*
+ * On ARM64, MemoryBarrier() may not generate explicit dmb sy instructions.
+ * Use explicit __dmb(_ARM64_BARRIER_SY) to ensure sequential consistency,
+ * matching GCC's __sync_synchronize() behavior which generates dmb sy.
+ */
+#define pg_memory_barrier_impl() __dmb(_ARM64_BARRIER_SY)
+
+#else
+/*
+ * On x86/x64 and ARM32, MemoryBarrier() provides full memory barriers
+ */
#define pg_memory_barrier_impl() MemoryBarrier()
+
+#endif
#endif
#define PG_HAVE_ATOMIC_U32_SUPPORT
@@ -43,31 +64,93 @@ typedef struct pg_attribute_aligned(8) pg_atomic_uint64
} pg_atomic_uint64;
+/*
+ * ARM64-specific memory barrier helper macros
+ *
+ * MSVC ARM64 _Interlocked* intrinsics do not emit full memory barriers.
+ * We use explicit __dmb(_ARM64_BARRIER_SY) to provide sequential
+ * consistency.
+ */
+#if defined(_M_ARM64)
+#define PG_ARM64_DMB_BEFORE() __dmb(_ARM64_BARRIER_SY)
+#define PG_ARM64_DMB_AFTER() __dmb(_ARM64_BARRIER_SY)
+#else
+#define PG_ARM64_DMB_BEFORE() ((void)0)
+#define PG_ARM64_DMB_AFTER() ((void)0)
+#endif
+
+
+ /*
+ * pg_atomic_compare_exchange_u32
+ *
+ * Compare and swap for 32-bit values with full memory barrier semantics
+ * (sequential consistency).
+ *
+ * Implementation notes:
+ * - ARM64: Explicit DMB barriers added before and after CAS
+ * - x86/x64, ARM32: _InterlockedCompareExchange provides full barriers
+ */
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
-pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
- uint32 *expected, uint32 newval)
+pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32* ptr,
+ uint32* expected, uint32 newval)
{
bool ret;
uint32 current;
+
+ PG_ARM64_DMB_BEFORE();
current = InterlockedCompareExchange(&ptr->value, newval, *expected);
+ PG_ARM64_DMB_AFTER();
+
ret = current == *expected;
*expected = current;
return ret;
}
+/*
+ * pg_atomic_exchange_u32
+ *
+ * Atomically exchange a new value for an old value with full memory barrier
+ * semantics (sequential consistency).
+ *
+ * Implementation notes:
+ * - ARM64: Explicit DMB barriers added before and after exchange
+ * - x86/x64, ARM32: _InterlockedExchange provides full barriers
+ */
#define PG_HAVE_ATOMIC_EXCHANGE_U32
static inline uint32
-pg_atomic_exchange_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 newval)
+pg_atomic_exchange_u32_impl(volatile pg_atomic_uint32* ptr, uint32 newval)
{
- return InterlockedExchange(&ptr->value, newval);
+ uint32 result;
+
+ PG_ARM64_DMB_BEFORE();
+ result = InterlockedExchange(&ptr->value, newval);
+ PG_ARM64_DMB_AFTER();
+
+ return result;
}
+/*
+ * pg_atomic_fetch_add_u32
+ *
+ * Atomically add a value to an atomic variable with full memory barrier
+ * semantics (sequential consistency).
+ *
+ * Implementation notes:
+ * - ARM64: Explicit DMB barriers added before and after fetch-add
+ * - x86/x64, ARM32: _InterlockedExchangeAdd provides full barriers
+ */
#define PG_HAVE_ATOMIC_FETCH_ADD_U32
static inline uint32
-pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
+pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32* ptr, int32 add_)
{
- return InterlockedExchangeAdd(&ptr->value, add_);
+ uint32 result;
+
+ PG_ARM64_DMB_BEFORE();
+ result = InterlockedExchangeAdd(&ptr->value, add_);
+ PG_ARM64_DMB_AFTER();
+
+ return result;
}
/*
@@ -78,14 +161,28 @@ pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
*/
#pragma intrinsic(_InterlockedCompareExchange64)
+ /*
+ * pg_atomic_compare_exchange_u64
+ *
+ * Compare and swap for 64-bit values with full memory barrier semantics
+ * (sequential consistency).
+ *
+ * Implementation notes:
+ * - ARM64: Explicit DMB barriers added before and after CAS
+ * - x86/x64, ARM32: _InterlockedCompareExchange64 provides full barriers
+ */
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
-pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
- uint64 *expected, uint64 newval)
+pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64* ptr,
+ uint64* expected, uint64 newval)
{
bool ret;
uint64 current;
+
+ PG_ARM64_DMB_BEFORE();
current = _InterlockedCompareExchange64(&ptr->value, newval, *expected);
+ PG_ARM64_DMB_AFTER();
+
ret = current == *expected;
*expected = current;
return ret;
@@ -96,20 +193,52 @@ pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
#pragma intrinsic(_InterlockedExchange64)
+/*
+ * pg_atomic_exchange_u64
+ *
+ * Atomically exchange a new 64-bit value for an old value with full memory
+ * barrier semantics (sequential consistency).
+ *
+ * Implementation notes:
+ * - ARM64: Explicit DMB barriers added before and after exchange
+ * - x86/x64: _InterlockedExchange64 provides full barriers
+ */
#define PG_HAVE_ATOMIC_EXCHANGE_U64
static inline uint64
-pg_atomic_exchange_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 newval)
+pg_atomic_exchange_u64_impl(volatile pg_atomic_uint64* ptr, uint64 newval)
{
- return _InterlockedExchange64(&ptr->value, newval);
+ uint64 result;
+
+ PG_ARM64_DMB_BEFORE();
+ result = _InterlockedExchange64(&ptr->value, newval);
+ PG_ARM64_DMB_AFTER();
+
+ return result;
}
#pragma intrinsic(_InterlockedExchangeAdd64)
+/*
+ * pg_atomic_fetch_add_u64
+ *
+ * Atomically add a value to a 64-bit atomic variable with full memory barrier
+ * semantics (sequential consistency).
+ *
+ * Implementation notes:
+ * - ARM64: Explicit DMB barriers added before and after fetch-add
+ * - x86/x64: _InterlockedExchangeAdd64 provides full barriers
+ */
#define PG_HAVE_ATOMIC_FETCH_ADD_U64
static inline uint64
-pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
+pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64* ptr, int64 add_)
{
- return _InterlockedExchangeAdd64(&ptr->value, add_);
+ uint64 result;
+
+ PG_ARM64_DMB_BEFORE();
+ result = _InterlockedExchangeAdd64(&ptr->value, add_);
+ PG_ARM64_DMB_AFTER();
+
+ return result;
}
#endif /* _WIN64 */
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index be7aaf6b013..f226d2058de 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -598,29 +598,82 @@ tas(volatile slock_t *lock)
typedef LONG slock_t;
#define HAS_TEST_AND_SET
+
+/*
+ * MSVC spinlock implementation
+ *
+ * On ARM64, explicit DMB (Data Memory Barrier) instructions are required
+ * to provide sequential consistency semantics. InterlockedCompareExchange
+ * alone does not emit full barriers on ARM64 [1]. _ReadWriteBarrier() is
+ * a compiler barrier only and does not prevent CPU reordering [2].
+ *
+ * [1] - Microsoft Learn: _InterlockedCompareExchange Intrinsic Functions - ARM intrinsics
+ * https://learn.microsoft.com/en-us/cpp/intrinsics/interlockedcompareexchange-intrinsic-functions?view=msvc-170
+ * [2] - Microsoft Learn: _ReadWriteBarrier is a Compiler Barrier
+ * https://learn.microsoft.com/en-us/cpp/intrinsics/compiler-intrinsics?view=msvc-170
+ */
+#if defined(_M_ARM64)
+ /* ARM64 requires explicit memory barriers for spinlock acquire/release */
+#define TAS(lock) tas_msvc_arm64(lock)
+
+static __forceinline int
+tas_msvc_arm64(volatile slock_t* lock)
+{
+ int result;
+
+ /* Full barrier before atomic operation */
+ __dmb(_ARM64_BARRIER_SY);
+
+ /* Atomic compare-and-swap */
+ result = InterlockedCompareExchange(lock, 1, 0);
+
+ /* Full barrier after atomic operation */
+ __dmb(_ARM64_BARRIER_SY);
+
+ return result;
+}
+
+#define S_UNLOCK(lock) \
+ do { \
+ __dmb(_ARM64_BARRIER_SY); /* Full barrier before release */ \
+ (*(lock)) = 0; \
+ } while (0)
+
+#else
+ /*
+ * Non-ARM64 MSVC (x86/x64): InterlockedCompareExchange provides full barriers
+ *
+ * Microsoft Learn: InterlockedCompareExchange generates full memory barrier on x64
+ * https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-interlockedcompareexchange
+ */
#define TAS(lock) (InterlockedCompareExchange(lock, 1, 0))
+#define S_UNLOCK(lock) \
+ do { _ReadWriteBarrier(); (*(lock)) = 0; } while (0)
+
+#endif /* _M_ARM64 */
+
#define SPIN_DELAY() spin_delay()
-/*
- * If using Visual C++ on Win64, inline assembly is unavailable.
- * Use architecture specific intrinsics.
- */
+ /*
+ * If using Visual C++ on Win64, inline assembly is unavailable.
+ * Use architecture specific intrinsics.
+ */
#if defined(_WIN64)
-/*
- * For Arm64, use __isb intrinsic. See aarch64 inline assembly definition for details.
- */
+ /*
+ * For Arm64, use __isb intrinsic. See aarch64 inline assembly definition for details.
+ */
#ifdef _M_ARM64
static __forceinline void
spin_delay(void)
{
- /* Reference: https://learn.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics#BarrierRestrictions */
+ /* Reference: https://learn.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics#BarrierRestrictions */
__isb(_ARM64_BARRIER_SY);
}
#else
-/*
- * For x64, use _mm_pause intrinsic instead of rep nop.
- */
+ /*
+ * For x64, use _mm_pause intrinsic instead of rep nop.
+ */
static __forceinline void
spin_delay(void)
{
@@ -639,10 +692,7 @@ spin_delay(void)
#include <intrin.h>
#pragma intrinsic(_ReadWriteBarrier)
-#define S_UNLOCK(lock) \
- do { _ReadWriteBarrier(); (*(lock)) = 0; } while (0)
-
-#endif
+#endif /* _MSC_VER */
#endif /* !defined(HAS_TEST_AND_SET) */
--
2.52.0.windows.1