Thread

  1. IO in wrong state on riscv64

    Alexander Lakhin <exclusion@gmail.com> — 2025-10-11T13:00:00Z

    Hello Andres,
    
    I've spotted one more interesting AIO-related failure on a riscv64 machine
    [1]:
    === dumping /home/gburd/build/HEAD/pgsql.build/testrun/recovery/027_stream_regress/data/regression.diffs ===
    diff -U3 /home/gburd/build/HEAD/pgsql/src/test/regress/expected/tsearch.out 
    /home/gburd/build/HEAD/pgsql.build/testrun/recovery/027_stream_regress/data/results/tsearch.out
    --- /home/gburd/build/HEAD/pgsql/src/test/regress/expected/tsearch.out 2025-10-09 00:06:10.612959611 +0000
    +++ /home/gburd/build/HEAD/pgsql.build/testrun/recovery/027_stream_regress/data/results/tsearch.out 2025-10-09 
    00:27:46.697588198 +0000
    @@ -163,11 +163,7 @@
      (1 row)
    
      SELECT count(*) FROM test_tsvector WHERE a @@ '!qe <2> qt';
    - count
    --------
    -     6
    -(1 row)
    -
    +ERROR:  IO in wrong state: 0
      SELECT count(*) FROM test_tsvector WHERE a @@ '!(pl <-> yh)';
       count
      -------
    
    I've managed to reproduce it using qemu-system-riscv64 with Debian trixie
    (6.11.10-riscv64, clang 19.1.4):
    $ CC=clang-19 ./configure --enable-debug --enable-cassert --enable-tap-tests && time make -j8 -s
    ...
    configure: using CFLAGS=-Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Werror=vla 
    -Werror=unguarded-availability-new -Wendif-labels -Wmissing-format-attribute -Wcast-function-type -Wformat-security 
    -Wmissing-variable-declarations -fno-strict-aliasing -fwrapv -fexcess-precision=standard 
    -Wno-unused-command-line-argument -Wno-compound-token-split-by-macro -Wno-format-truncation 
    -Wno-cast-function-type-strict -g -O2
    
    when running `make check` in a loop (it fails within 10 iterations for me)
    with the reduced parallel_schedule:
    test: test_setup
    
    test: boolean char name varchar text int2 int4 int8 oid float8
    x10
    
    and extra.config:
    log_line_prefix = '%m [%p:%l] %q%a '
    log_connections = 'true'
    log_disconnections = 'true'
    log_statement = 'all'
    fsync = off
    restart_after_crash = off
    
    debug_parallel_query = regress
    
    shared_buffers =  512kB
    
    backtrace_functions = 'pgaio_io_wait'
    # mostly reflects greenfly's config, plus shared_buffers decreased
    # below 027_stream_regress's 1MB.
    
    With the following debug logging added:
    --- a/src/backend/storage/aio/aio.c
    +++ b/src/backend/storage/aio/aio.c
    @@ -578,6 +578,20 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
             PgAioHandleState state;
             bool            am_owner;
    
    +int inistate = ioh->state;
    +if (inistate == PGAIO_HS_IDLE)
    +{
    +  fprintf(stderr, "!!!pgaio_io_wait[%d]| initial ioh->state: %d\n", getpid(), inistate);
    +}
    +for (int i = 0; i < 10; i++)
    +{
    +  int curstate = ioh->state;
    +  if (curstate != inistate)
    +  {
    +    elog(PANIC, "!!!pgaio_io_wait| ioh->state changed from %d to %d at iteration %d", inistate, curstate, i);
    +    break;
    +  }
    +}
             am_owner = ioh->owner_procno == MyProcNumber;
    
             if (pgaio_io_was_recycled(ioh, ref_generation, &state))
    @@ -599,6 +613,10 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
             {
                     if (pgaio_io_was_recycled(ioh, ref_generation, &state))
                             return;
    +if (ioh->state == PGAIO_HS_IDLE)
    +{
    +  fprintf(stderr, "!!!pgaio_io_wait[%d]| ioh->state: %d (initial state: %d)\n", getpid(), ioh->state, inistate);
    +}
    
                     switch ((PgAioHandleState) state)
                     {
    
    I'm observing several cases:
    1)
    2025-10-11 11:45:17.947 UTC [1191355:258] pg_regress/float8 LOG: statement: SELECT x,
                    sind(x),
                    sind(x) IN (-1,-0.5,0,0.5,1) AS sind_exact
             FROM (VALUES (0), (30), (90), (150), (180),
                   (210), (270), (330), (360)) AS t(x);
    2025-10-11 11:45:17.947 UTC [1191359:206] pg_regress/int8 LOG: statement: SELECT * FROM 
    generate_series('+4567890123456789'::int8, '+4567890123456799'::int8);
    !!!pgaio_io_wait[1191744]| ioh->state: 0 (initial state: 5)
    2025-10-11 11:45:17.993 GMT [1191744:1] ERROR:  IO in wrong state: 0
    2025-10-11 11:45:17.993 GMT [1191744:2] BACKTRACE:
             postgres: parallel worker for PID 1191355 (+0x4372a0) [0x555561b8a2a0]
             postgres: parallel worker for PID 1191355 (+0x442c66) [0x555561b95c66]
             postgres: parallel worker for PID 1191355 (StartBufferIO+0x66) [0x555561b95ac4]
             postgres: parallel worker for PID 1191355 (+0x43e014) [0x555561b91014]
             postgres: parallel worker for PID 1191355 (StartReadBuffer+0x308) [0x555561b90958]
    ...
             postgres: parallel worker for PID 1191355 (+0x3194b2) [0x555561a6c4b2]
             /lib/riscv64-linux-gnu/libc.so.6(+0x2791c) [0x7fffb9ef791c]
             /lib/riscv64-linux-gnu/libc.so.6(__libc_start_main+0x74) [0x7fffb9ef79c4]
             postgres: parallel worker for PID 1191355 (_start+0x20) [0x5555618382c8]
    ...
    2025-10-11 11:45:17.999 UTC [1191355:259] pg_regress/float8 ERROR: IO in wrong state: 0
    2025-10-11 11:45:17.999 UTC [1191355:260] pg_regress/float8 BACKTRACE:
             postgres: debian regression [local] SELECT(ProcessParallelMessages+0x114) [0x5555618dca6c]
             postgres: debian regression [local] SELECT(ProcessInterrupts+0x340) [0x555561bce864]
             postgres: debian regression [local] SELECT(+0x2e0e9c) [0x555561a33e9c]
             postgres: debian regression [local] SELECT(standard_ExecutorRun+0x10c) [0x555561a19c20]
    ...
             /lib/riscv64-linux-gnu/libc.so.6(+0x2791c) [0x7fffb9ef791c]
             /lib/riscv64-linux-gnu/libc.so.6(__libc_start_main+0x74) [0x7fffb9ef79c4]
             postgres: debian regression [local] SELECT(_start+0x20) [0x5555618382c8]
    2025-10-11 11:45:17.999 UTC [1191355:261] pg_regress/float8 STATEMENT:  SELECT x,
                    sind(x),
                    sind(x) IN (-1,-0.5,0,0.5,1) AS sind_exact
             FROM (VALUES (0), (30), (90), (150), (180),
                   (210), (270), (330), (360)) AS t(x);
    2025-10-11 11:45:17.999 UTC [1189893:98] LOG:  background worker "parallel worker" (PID 1191744) exited with exit code 1
    
    2)
    !!!pgaio_io_wait[1353521]| ioh->state: 0 (initial state: 4)
    ...
    2025-10-11 12:48:00.503 UTC [1353521:1] ERROR:  IO in wrong state: 0
    2025-10-11 12:48:00.503 UTC [1353521:2] BACKTRACE:
             postgres: parallel worker for PID 1353449 (+0x4372a0) [0x5555905562a0]
             postgres: parallel worker for PID 1353449 (+0x442c66) [0x555590561c66]
             postgres: parallel worker for PID 1353449 (StartBufferIO+0x66) [0x555590561ac4]
             postgres: parallel worker for PID 1353449 (+0x43e014) [0x55559055d014]
             postgres: parallel worker for PID 1353449 (StartReadBuffer+0x308) [0x55559055c958]
    ...
             /lib/riscv64-linux-gnu/libc.so.6(+0x2791c) [0x7fffaacf791c]
             /lib/riscv64-linux-gnu/libc.so.6(__libc_start_main+0x74) [0x7fffaacf79c4]
             postgres: parallel worker for PID 1353449 (_start+0x20) [0x5555902042c8]
    2025-10-11 12:48:00.503 UTC [1353521:3] STATEMENT:  SELECT c.f1 FROM NAME_TBL c WHERE c.f1 ~ '[0-9]';
    ...
    2025-10-11 12:48:00.512 UTC [1352470:51] LOG:  background worker "parallel worker" (PID 1353521) exited with exit code 1
    2025-10-11 12:48:00.513 UTC [1353449:24] pg_regress/name ERROR:  IO in wrong state: 0
    2025-10-11 12:48:00.513 UTC [1353449:25] pg_regress/name BACKTRACE:
             postgres: debian regression [local] SELECT(ProcessParallelMessages+0x114) [0x5555902a8a6c]
             postgres: debian regression [local] SELECT(ProcessInterrupts+0x340) [0x55559059a864]
             postgres: debian regression [local] SELECT(+0x2e0e9c) [0x5555903ffe9c]
             postgres: debian regression [local] SELECT(standard_ExecutorRun+0x10c) [0x5555903e5c20]
             postgres: debian regression [local] SELECT(+0x481a30) [0x5555905a0a30]
             postgres: debian regression [local] SELECT(PortalRun+0x180) [0x5555905a06e6]
             postgres: debian regression [local] SELECT(+0x480880) [0x55559059f880]
             postgres: debian regression [local] SELECT(PostgresMain+0xdfc) [0x55559059d7d8]
             postgres: debian regression [local] SELECT(+0x4793f0) [0x5555905983f0]
             postgres: debian regression [local] SELECT(postmaster_child_launch+0x132) [0x5555904f787c]
             postgres: debian regression [local] SELECT(+0x3dcd52) [0x5555904fbd52]
             postgres: debian regression [local] SELECT(InitProcessGlobals+0) [0x5555904f9c96]
             postgres: debian regression [local] SELECT(+0x3194b2) [0x5555904384b2]
             /lib/riscv64-linux-gnu/libc.so.6(+0x2791c) [0x7fffaacf791c]
             /lib/riscv64-linux-gnu/libc.so.6(__libc_start_main+0x74) [0x7fffaacf79c4]
             postgres: debian regression [local] SELECT(_start+0x20) [0x5555902042c8]
    2025-10-11 12:48:00.513 UTC [1353449:26] pg_regress/name STATEMENT: SELECT c.f1 FROM NAME_TBL c WHERE c.f1 ~ '[0-9]';
    
    I also observed authentication failed due to this error.
    
    3)
    !!!pgaio_io_wait[1169773]| initial ioh->state: 0
    
    2025-10-11 11:34:46.793 GMT [1169773:1] PANIC:  !!!pgaio_io_wait| ioh->state changed from 0 to 1 at iteration 0
    # no other iteration number observed
    2025-10-11 11:34:46.793 GMT [1169773:2] BACKTRACE:
             postgres: parallel worker for PID 1169684 (+0x4372a0) [0x55555c3492a0]
             postgres: parallel worker for PID 1169684 (+0x442c66) [0x55555c354c66]
             postgres: parallel worker for PID 1169684 (StartBufferIO+0x66) [0x55555c354ac4]
             postgres: parallel worker for PID 1169684 (+0x43e014) [0x55555c350014]
             postgres: parallel worker for PID 1169684 (StartReadBuffer+0x308) [0x55555c34f958]
    ...
             /lib/riscv64-linux-gnu/libc.so.6(+0x2791c) [0x7fff8faf791c]
             /lib/riscv64-linux-gnu/libc.so.6(__libc_start_main+0x74) [0x7fff8faf79c4]
             postgres: parallel worker for PID 1169684 (_start+0x20) [0x55555bff72c8]
    
    2025-10-11 11:34:46.804 UTC [1169684:93] pg_regress/text ERROR: !!!pgaio_io_wait| ioh->state changed from 0 to 1 at 
    iteration 0
    2025-10-11 11:34:46.804 UTC [1169684:94] pg_regress/text BACKTRACE:
             postgres: debian regression [local] SELECT(ProcessParallelMessages+0x114) [0x55555c09ba6c]
             postgres: debian regression [local] SELECT(ProcessInterrupts+0x340) [0x55555c38d864]
             postgres: debian regression [local] SELECT(+0x2e0e9c) [0x55555c1f2e9c]
             postgres: debian regression [local] SELECT(standard_ExecutorRun+0x10c) [0x55555c1d8c20]
    ...
             /lib/riscv64-linux-gnu/libc.so.6(+0x2791c) [0x7fff8faf791c]
             /lib/riscv64-linux-gnu/libc.so.6(__libc_start_main+0x74) [0x7fff8faf79c4]
             postgres: debian regression [local] SELECT(_start+0x20) [0x55555bff72c8]
    With CFLAGS="-O0", it fails much sooner...
    
    Probably the latter case is OK, I just wanted to check if the field
    is apparently changed on the fly...
    
    
    [1] https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=greenfly&dt=2025-10-09%2000%3A06%3A03
    
    Best regards,
    Alexander