0002-Atomically-test-and-set-PHJ-RIGHT_SEMI-match-flags.patch
text/x-patch
Filename: 0002-Atomically-test-and-set-PHJ-RIGHT_SEMI-match-flags.patch
Type: text/x-patch
Part: 1
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch 0002
Subject: Atomically test-and-set PHJ RIGHT_SEMI match flags.
| File | + | − |
|---|---|---|
| src/backend/executor/nodeHashjoin.c | 28 | 2 |
| src/include/access/htup_details.h | 12 | 0 |
| src/include/port/atomics.h | 54 | 0 |
From d26f8ff4e9ba62563f55bb47eb82fe14fb424024 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 11 Nov 2025 13:37:49 +1300
Subject: [PATCH 2/2] Atomically test-and-set PHJ RIGHT_SEMI match flags.
Fix the race that led to PHJ RIGHT_SEMI being reverted from v18 (commit
257ee783), by introducing an introducing an atomic test-and-set function
for match flags.. Each tuple can be emitted exactly once on first match
in any backend, when a RIGHT_SEMI join is executed by PHJ.
TODO: not seriously tested, focusing more on the atomic portability
questions than the RIGHT_SEMI correctness (see patch for concerns)
TODO: figure out if it's worth specializing this function for (1) no
match tracking, (2) relaxed match tracking and (3) atomic match
tracking, but for now it's tested at runtime
Discussion: https://postgr.es/m/CA%2BhUKGK2ken0RAHV1crSaS6zNWDkSWxKdb%3DS9BSxAUo0CfZx7g%40mail.gmail.com
---
src/backend/executor/nodeHashjoin.c | 30 ++++++++++++++--
src/include/access/htup_details.h | 12 +++++++
src/include/port/atomics.h | 54 +++++++++++++++++++++++++++++
3 files changed, 94 insertions(+), 2 deletions(-)
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 5661ad76830..e0d5230d9cd 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -560,8 +560,34 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
* we are in a right-semijoin, but we'll avoid the branch
* and just set it always.
*/
- if (!HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)))
- HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple));
+ if (parallel && node->js.jointype == JOIN_RIGHT_SEMI)
+ {
+ /*
+ * The earlier check for a match flag was relaxed and
+ * might have seen an arbitrarily out-of-date value.
+ * Now we use an atomic check, to make sure that only
+ * one backend sets the match flag and emits this
+ * tuple. Any others skip the tuple having wasted a
+ * few cycles.
+ */
+ if (HeapTupleHeaderTestAndSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)))
+ continue;
+ }
+ else
+ {
+ /*
+ * In serial joins there is no concurrency to worry
+ * about, and in parallel joins other than
+ * JOIN_RIGHT_SEMI, the match flags won't be consulted
+ * until after synchronizing on a barrier and it
+ * doesn't matter if multiple backends sometimes set
+ * it concurrently, so we can be quite sloppy here. We
+ * still avoid setting it if we can already see it's
+ * set, just to avoid dirtying memory unnecessarily.
+ */
+ if (!HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)))
+ HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple));
+ }
/* In an antijoin, we never return a matched tuple */
if (node->js.jointype == JOIN_ANTI)
diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h
index f3593acc8c2..8424a57bdac 100644
--- a/src/include/access/htup_details.h
+++ b/src/include/access/htup_details.h
@@ -18,6 +18,7 @@
#include "access/transam.h"
#include "access/tupdesc.h"
#include "access/tupmacs.h"
+#include "port/atomics.h"
#include "storage/bufpage.h"
#include "varatt.h"
@@ -719,6 +720,17 @@ HeapTupleHeaderSetMatch(MinimalTupleData *tup)
tup->t_infomask2 |= HEAP_TUPLE_HAS_MATCH;
}
+/*
+ * Atomically set the match flag and report whether it was already set. False
+ * means that the caller was the first to set it.
+ */
+static inline bool
+HeapTupleHeaderTestAndSetMatch(MinimalTupleData *tup)
+{
+ return atomic_fetch_or(pg_atomic_cast(&tup->t_infomask2),
+ HEAP_TUPLE_HAS_MATCH) & HEAP_TUPLE_HAS_MATCH;
+}
+
static inline void
HeapTupleHeaderClearMatch(MinimalTupleData *tup)
{
diff --git a/src/include/port/atomics.h b/src/include/port/atomics.h
index 61a6b4322fd..79299e0f479 100644
--- a/src/include/port/atomics.h
+++ b/src/include/port/atomics.h
@@ -66,6 +66,8 @@ extern "C++"
#define pg_atomic(T) _Atomic(T)
#endif
+#include <stdalign.h>
+
/* Assumptions about lock-free access. */
StaticAssertDecl(ATOMIC_CHAR_LOCK_FREE >= 2, "need lock-free 8-bit atomics");
StaticAssertDecl(ATOMIC_SHORT_LOCK_FREE >= 2, "need lock-free 16-bit atomics");
@@ -85,6 +87,58 @@ pg_atomic_uint32;
typedef pg_atomic(uint64)
pg_atomic_uint64;
+/* With care, it is safe to cast pointers to lock-free atomic types. */
+/*
+ * XXX Here's a nasty complication: the following assertions might be true in
+ * practice today (?), but would break down if we one day want to add uint64
+ * to the set, since i386 has alignof(uint64) == 4 but
+ * alignof(pg_atomic_uint64) == 8. Maybe that'd be OK because we can't require
+ * lock-free 64-bit until we drop armv7, and we'd likely drop i386 at the same
+ * time, but...
+ *
+ * The alternatives seem to be:
+ *
+ * 1. Make things like t_infomask2 actually pg_atomic_uint16 in the first
+ * place, and deal with the spreading consequences all over the tree, ie
+ * relaxed atomic accesses everywhere. This is annoying because I strongly
+ * suspect the cast will work on any platform (even those that don't have
+ * native 16 bit atomics eg some RISC-V variants have that fact hidden from us
+ * by the compiler widening ops).
+ *
+ * 2. Don't cast, instead have anonymous unions in the right places: then
+ * t_infomask2 and t_infomask2_atomic could provide syntaxless type punning,
+ * and it might be easier to reason about the aligment (lock-free guarantee
+ * seems to imply size is the same so we'd just assert that).
+ *
+ * 3. Accept that some casts are OK with careful analysis and more information
+ * about portability risks.
+ *
+ * XXX t_infomask2 might be a sort of test case for future potential use of
+ * lock-free access to shared data structures that previously had course
+ * grained locking (ultimately perhaps buffer pool contents, ie concurrent
+ * tuple insertion).
+ */
+#define PG_ATOMIC_CHECK_CASTABLE(type) \
+ StaticAssertDecl(sizeof(type) == sizeof(pg_atomic(type)), \
+ #type " has bad size for pg_atomic_cast"); \
+ StaticAssertDecl(alignof(type) == alignof(pg_atomic(type)), \
+ #type " has bad alignment for pg_atomic_cast");
+PG_ATOMIC_CHECK_CASTABLE(uint8);
+PG_ATOMIC_CHECK_CASTABLE(uint16);
+PG_ATOMIC_CHECK_CASTABLE(uint32);
+
+/* Cast T* -> pg_atomic(T)*, lock-free alignment-compatible types only. */
+#ifdef __cplusplus
+#define pg_atomic_cast(P) \
+ ((pg_atomic(std::remove_pointer<decltype(P)>::type) *)(P))
+#else
+#define pg_atomic_cast(P) _Generic((P), \
+ uint32 *: (((pg_atomic(uint32) *) (P))), \
+ uint16 *: (((pg_atomic(uint16) *) (P))), \
+ uint8 *: (((pg_atomic(uint8) *) (P))) \
+)
+#endif
+
/*
* First a set of architecture specific files is included.
*
--
2.51.1