v25-0007-aio-Fix-pgaio_io_wait-for-staged-IOs-B.patch

application/octet-stream

Filename: v25-0007-aio-Fix-pgaio_io_wait-for-staged-IOs-B.patch
Type: application/octet-stream
Part: 1
Message: Re: index prefetching

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v25-0007
Subject: aio: Fix pgaio_io_wait() for staged IOs (B).
File+
src/backend/storage/aio/aio.c 42 8
src/backend/storage/aio/aio_init.c 1 0
src/backend/utils/activity/wait_event_names.txt 1 0
src/include/storage/aio_internal.h 7 0
From 667061650eab7fa95fd07b8c892102cf7a8a47f4 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Sun, 22 Mar 2026 15:12:41 -0400
Subject: [PATCH v25 7/8] aio: Fix pgaio_io_wait() for staged IOs (B).

Previously, pgaio_io_wait()'s cases for PGAIO_HS_DEFINED and
PGAIO_HS_STAGED fell through to waiting for completion.  The owner only
promises to advance it to PGAIO_HS_SUBMITTED.  The waiter needs to be
prepared to call ->wait_one() itself once the IO is submitted in order
to guarantee progress and avoid deadlocks on IO methods that provide
->wait_one().

Introduce a new per-backend condition variable submit_cv, woken by by
pgaio_submit_stage(), and use it to wait for the state to advance.  The
new broadcast doesn't seem to cause any measurable slowdown, so ideas
for optimizing the common no-waiters case were abandoned for now.

It may not be possible to reach any real deadlock with existing AIO
users, but that situation could change.  There's also no reason the
waiter shouldn't begin to wait via the IO method as soon as possible
even without a deadlock.

Picked up by testing a proposed IO method that has ->wait_one(), like
io_method=io_uring, and code review.

Backpatch-through: 18
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/CA%2BhUKG%2BmZYrSdnhk-XrBYO18H829K77S9gMKUsykOiTJtqB43g%40mail.gmail.com
---
 src/include/storage/aio_internal.h            |  7 +++
 src/backend/storage/aio/aio.c                 | 50 ++++++++++++++++---
 src/backend/storage/aio/aio_init.c            |  1 +
 .../utils/activity/wait_event_names.txt       |  1 +
 4 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/src/include/storage/aio_internal.h b/src/include/storage/aio_internal.h
index 9ca4087aa..c3afde4d5 100644
--- a/src/include/storage/aio_internal.h
+++ b/src/include/storage/aio_internal.h
@@ -216,6 +216,13 @@ typedef struct PgAioBackend
 	uint16		num_staged_ios;
 	PgAioHandle *staged_ios[PGAIO_SUBMIT_BATCH_SIZE];
 
+	/*
+	 * Other backends sometimes need to wait for the owning backend to submit.
+	 * The per-IO CV would work for that purpose, but a per-backend CV allows
+	 * for just one broadcast per submitted batch.
+	 */
+	ConditionVariable submit_cv;
+
 	/*
 	 * List of in-flight IOs. Also contains IOs that aren't strictly speaking
 	 * in-flight anymore, but have been waited-for and completed by another
diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c
index 7009fb7f6..ce2f73bff 100644
--- a/src/backend/storage/aio/aio.c
+++ b/src/backend/storage/aio/aio.c
@@ -593,6 +593,16 @@ pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState
 	return ioh->generation != ref_generation;
 }
 
+/*
+ * Whether we need to wait via the IO method. Don't check via the IO method if
+ * the issuing backend is executing the IO synchronously.
+ */
+static bool
+pgaio_io_needs_wait_one(PgAioHandle *ioh)
+{
+	return pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS);
+}
+
 /*
  * Wait for IO to complete. External code should never use this, outside of
  * the AIO subsystem waits are only allowed via pgaio_wref_wait().
@@ -632,23 +642,38 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
 				elog(ERROR, "IO in wrong state: %d", state);
 				break;
 
-			case PGAIO_HS_SUBMITTED:
+			case PGAIO_HS_DEFINED:
+			case PGAIO_HS_STAGED:
 
 				/*
-				 * If we need to wait via the IO method, do so now. Don't
-				 * check via the IO method if the issuing backend is executing
-				 * the IO synchronously.
+				 * The owner hasn't submitted the IO yet. If we need to wait
+				 * via the IO method, wait for submission, giving this backend
+				 * the chance to call ->wait_one().
 				 */
-				if (pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS))
+				if (pgaio_io_needs_wait_one(ioh))
+				{
+					PgAioBackend *backend = &pgaio_ctl->backend_state[ioh->owner_procno];
+
+					ConditionVariablePrepareToSleep(&backend->submit_cv);
+					while (!pgaio_io_was_recycled(ioh, ref_generation, &state) &&
+						   (state == PGAIO_HS_DEFINED ||
+							state == PGAIO_HS_STAGED))
+						ConditionVariableSleep(&backend->submit_cv, WAIT_EVENT_AIO_IO_SUBMIT);
+					ConditionVariableCancelSleep();
+					continue;
+				}
+				pg_fallthrough;
+
+			case PGAIO_HS_SUBMITTED:
+
+				/* If we need to wait via the IO method, do so now. */
+				if (pgaio_io_needs_wait_one(ioh))
 				{
 					pgaio_method_ops->wait_one(ioh, ref_generation);
 					continue;
 				}
 				pg_fallthrough;
 
-				/* waiting for owner to submit */
-			case PGAIO_HS_DEFINED:
-			case PGAIO_HS_STAGED:
 				/* waiting for reaper to complete */
 				/* fallthrough */
 			case PGAIO_HS_COMPLETED_IO:
@@ -1226,6 +1251,15 @@ pgaio_submit_staged(void)
 
 	pgaio_my_backend->num_staged_ios = 0;
 
+	/*
+	 * Wake any backend that started waiting for any of these IOs before
+	 * submission, if it is necessary to call ->wait_one() to guarantee
+	 * progress with the configured IO method.  On its side, pgaio_io_wait()
+	 * only waits for submit_cv on IO methods needing that.
+	 */
+	if (pgaio_method_ops->wait_one)
+		ConditionVariableBroadcast(&pgaio_my_backend->submit_cv);
+
 	pgaio_debug(DEBUG4,
 				"aio: submitted %d IOs",
 				total_submitted);
diff --git a/src/backend/storage/aio/aio_init.c b/src/backend/storage/aio/aio_init.c
index da30d792a..6fc00a917 100644
--- a/src/backend/storage/aio/aio_init.c
+++ b/src/backend/storage/aio/aio_init.c
@@ -199,6 +199,7 @@ AioShmemInit(void *arg)
 
 		dclist_init(&bs->idle_ios);
 		memset(bs->staged_ios, 0, sizeof(PgAioHandle *) * PGAIO_SUBMIT_BATCH_SIZE);
+		ConditionVariableInit(&bs->submit_cv);
 		dclist_init(&bs->in_flight_ios);
 
 		/* initialize per-backend IOs */
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 560659f95..7c3326348 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -200,6 +200,7 @@ ABI_compatibility:
 Section: ClassName - WaitEventIO
 
 AIO_IO_COMPLETION	"Waiting for another process to complete IO."
+AIO_IO_SUBMIT	"Waiting for another process to submit IO."
 AIO_IO_URING_SUBMIT	"Waiting for IO submission via io_uring."
 AIO_IO_URING_EXECUTION	"Waiting for IO execution via io_uring."
 BASEBACKUP_READ	"Waiting for base backup to read from a file."
-- 
2.53.0