0002-Atomically-test-and-set-PHJ-RIGHT_SEMI-match-flags.patch

text/x-patch

Filename: 0002-Atomically-test-and-set-PHJ-RIGHT_SEMI-match-flags.patch
Type: text/x-patch
Part: 1
Message: Re: [EXTERNAL]Re: BUG #19094: select statement on postgres 17 vs postgres 18 is returning different/duplicate results

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch 0002
Subject: Atomically test-and-set PHJ RIGHT_SEMI match flags.
File+
src/backend/executor/nodeHashjoin.c 28 2
src/include/access/htup_details.h 12 0
src/include/port/atomics.h 54 0
From d26f8ff4e9ba62563f55bb47eb82fe14fb424024 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 11 Nov 2025 13:37:49 +1300
Subject: [PATCH 2/2] Atomically test-and-set PHJ RIGHT_SEMI match flags.

Fix the race that led to PHJ RIGHT_SEMI being reverted from v18 (commit
257ee783), by introducing an introducing an atomic test-and-set function
for match flags..  Each tuple can be emitted exactly once on first match
in any backend, when a RIGHT_SEMI join is executed by PHJ.

TODO: not seriously tested, focusing more on the atomic portability
questions than the RIGHT_SEMI correctness (see patch for concerns)

TODO: figure out if it's worth specializing this function for (1) no
match tracking, (2) relaxed match tracking and (3) atomic match
tracking, but for now it's tested at runtime

Discussion: https://postgr.es/m/CA%2BhUKGK2ken0RAHV1crSaS6zNWDkSWxKdb%3DS9BSxAUo0CfZx7g%40mail.gmail.com
---
 src/backend/executor/nodeHashjoin.c | 30 ++++++++++++++--
 src/include/access/htup_details.h   | 12 +++++++
 src/include/port/atomics.h          | 54 +++++++++++++++++++++++++++++
 3 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 5661ad76830..e0d5230d9cd 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -560,8 +560,34 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
 					 * we are in a right-semijoin, but we'll avoid the branch
 					 * and just set it always.
 					 */
-					if (!HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)))
-						HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple));
+					if (parallel && node->js.jointype == JOIN_RIGHT_SEMI)
+					{
+						/*
+						 * The earlier check for a match flag was relaxed and
+						 * might have seen an arbitrarily out-of-date value.
+						 * Now we use an atomic check, to make sure that only
+						 * one backend sets the match flag and emits this
+						 * tuple.  Any others skip the tuple having wasted a
+						 * few cycles.
+						 */
+						if (HeapTupleHeaderTestAndSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)))
+							continue;
+					}
+					else
+					{
+						/*
+						 * In serial joins there is no concurrency to worry
+						 * about, and in parallel joins other than
+						 * JOIN_RIGHT_SEMI, the match flags won't be consulted
+						 * until after synchronizing on a barrier and it
+						 * doesn't matter if multiple backends sometimes set
+						 * it concurrently, so we can be quite sloppy here. We
+						 * still avoid setting it if we can already see it's
+						 * set, just to avoid dirtying memory unnecessarily.
+						 */
+						if (!HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)))
+							HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple));
+					}
 
 					/* In an antijoin, we never return a matched tuple */
 					if (node->js.jointype == JOIN_ANTI)
diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h
index f3593acc8c2..8424a57bdac 100644
--- a/src/include/access/htup_details.h
+++ b/src/include/access/htup_details.h
@@ -18,6 +18,7 @@
 #include "access/transam.h"
 #include "access/tupdesc.h"
 #include "access/tupmacs.h"
+#include "port/atomics.h"
 #include "storage/bufpage.h"
 #include "varatt.h"
 
@@ -719,6 +720,17 @@ HeapTupleHeaderSetMatch(MinimalTupleData *tup)
 	tup->t_infomask2 |= HEAP_TUPLE_HAS_MATCH;
 }
 
+/*
+ * Atomically set the match flag and report whether it was already set.  False
+ * means that the caller was the first to set it.
+ */
+static inline bool
+HeapTupleHeaderTestAndSetMatch(MinimalTupleData *tup)
+{
+	return atomic_fetch_or(pg_atomic_cast(&tup->t_infomask2),
+						   HEAP_TUPLE_HAS_MATCH) & HEAP_TUPLE_HAS_MATCH;
+}
+
 static inline void
 HeapTupleHeaderClearMatch(MinimalTupleData *tup)
 {
diff --git a/src/include/port/atomics.h b/src/include/port/atomics.h
index 61a6b4322fd..79299e0f479 100644
--- a/src/include/port/atomics.h
+++ b/src/include/port/atomics.h
@@ -66,6 +66,8 @@ extern "C++"
 #define pg_atomic(T) _Atomic(T)
 #endif
 
+#include <stdalign.h>
+
 /* Assumptions about lock-free access. */
 StaticAssertDecl(ATOMIC_CHAR_LOCK_FREE >= 2, "need lock-free 8-bit atomics");
 StaticAssertDecl(ATOMIC_SHORT_LOCK_FREE >= 2, "need lock-free 16-bit atomics");
@@ -85,6 +87,58 @@ pg_atomic_uint32;
 typedef pg_atomic(uint64)
 pg_atomic_uint64;
 
+/* With care, it is safe to cast pointers to lock-free atomic types. */
+/*
+ * XXX Here's a nasty complication: the following assertions might be true in
+ * practice today (?), but would break down if we one day want to add uint64
+ * to the set, since i386 has alignof(uint64) == 4 but
+ * alignof(pg_atomic_uint64) == 8.  Maybe that'd be OK because we can't require
+ * lock-free 64-bit until we drop armv7, and we'd likely drop i386 at the same
+ * time, but...
+ *
+ * The alternatives seem to be:
+ *
+ * 1.  Make things like t_infomask2 actually pg_atomic_uint16 in the first
+ * place, and deal with the spreading consequences all over the tree, ie
+ * relaxed atomic accesses everywhere.  This is annoying because I strongly
+ * suspect the cast will work on any platform (even those that don't have
+ * native 16 bit atomics eg some RISC-V variants have that fact hidden from us
+ * by the compiler widening ops).
+ *
+ * 2.  Don't cast, instead have anonymous unions in the right places: then
+ * t_infomask2 and t_infomask2_atomic could provide syntaxless type punning,
+ * and it might be easier to reason about the aligment (lock-free guarantee
+ * seems to imply size is the same so we'd just assert that).
+ *
+ * 3.  Accept that some casts are OK with careful analysis and more information
+ * about portability risks.
+ *
+ * XXX t_infomask2 might be a sort of test case for future potential use of
+ * lock-free access to shared data structures that previously had course
+ * grained locking (ultimately perhaps buffer pool contents, ie concurrent
+ * tuple insertion).
+ */
+#define PG_ATOMIC_CHECK_CASTABLE(type) \
+	StaticAssertDecl(sizeof(type) == sizeof(pg_atomic(type)), \
+					 #type " has bad size for pg_atomic_cast"); \
+	StaticAssertDecl(alignof(type) == alignof(pg_atomic(type)), \
+					 #type " has bad alignment for pg_atomic_cast");
+PG_ATOMIC_CHECK_CASTABLE(uint8);
+PG_ATOMIC_CHECK_CASTABLE(uint16);
+PG_ATOMIC_CHECK_CASTABLE(uint32);
+
+/* Cast T* -> pg_atomic(T)*, lock-free alignment-compatible types only. */
+#ifdef __cplusplus
+#define pg_atomic_cast(P) \
+	((pg_atomic(std::remove_pointer<decltype(P)>::type) *)(P))
+#else
+#define pg_atomic_cast(P) _Generic((P), \
+	uint32 *: (((pg_atomic(uint32) *) (P))), \
+	uint16 *: (((pg_atomic(uint16) *) (P))), \
+	uint8 *:  (((pg_atomic(uint8)  *) (P))) \
+)
+#endif
+
 /*
  * First a set of architecture specific files is included.
  *
-- 
2.51.1