v2-0001-Add-file_extend_method-posix_fallocate-write_zero.patch

text/x-patch

Filename: v2-0001-Add-file_extend_method-posix_fallocate-write_zero.patch
Type: text/x-patch
Part: 0
Message: Re: [PING] fallocate() causes btrfs to never compress postgresql files
From 58ec33550147e324e5a6a8793c8e502b9e7065f2 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sat, 31 May 2025 22:50:22 +1200
Subject: [PATCH v2 1/3] Add file_extend_method=posix_fallocate,write_zeros.

Provide a way to disable the use of posix_fallocate() for relation
files.  It was introduced by commit 4d330a61bb1.  The new setting
file_extend_method=write_zeros can be used as a workaround for problems
reported from the field:

 * BTRFS compression is disabled by the use of posix_fallocate()
 * XFS users have reported a few cases of spurious ENOSPC that haven't
   been explained yet

The default is file_extend_method=posix_fallocate as before.  The new
mode is similar to PostgreSQL < 16, except that bulk extension writes
zeros for multiple blocks at a time.

Backpatch-through: 16
Reviewed-by: Jakub Wartak <jakub.wartak@enterprisedb.com>
Reported-by: Dimitrios Apostolou <jimis@gmx.net>
Discussion: https://postgr.es/m/b1843124-fd22-e279-a31f-252dffb6fbf2%40gmx.net
---
 doc/src/sgml/config.sgml                      | 37 +++++++++++++++++++
 src/backend/storage/file/fd.c                 |  3 ++
 src/backend/storage/smgr/md.c                 | 21 ++++++++---
 src/backend/utils/misc/guc_parameters.dat     |  7 ++++
 src/backend/utils/misc/guc_tables.c           |  9 +++++
 src/backend/utils/misc/postgresql.conf.sample |  4 ++
 src/include/storage/fd.h                      | 11 ++++++
 7 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 405c9689bd0..0b4922b35c4 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2410,6 +2410,43 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-file-extend-method" xreflabel="file_extend_method">
+      <term><varname>file_extend_method</varname> (<type>enum</type>)
+      <indexterm>
+       <primary><varname>file_extend_method</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the method used to extend data files during bulk operations
+        such as <command>COPY</command>.  The first available option is used as
+        the default, depending on the operating system:
+        <itemizedlist>
+         <listitem>
+          <para>
+           <literal>posix_fallocate</literal> (Unix) uses the standard POSIX
+            interface for allocating disk space, but is missing on some systems.
+            If it is present but the underlying file system doesn't support it,
+            this option silently falls back to <literal>write_zeros</literal>.
+            Current versions of BTRFS are known to disable compression when
+            this option is used.
+            This is the default on systems that have the function.
+           </para>
+         </listitem>
+         <listitem>
+          <para>
+           <literal>write_zeros</literal> extends files by writing out blocks
+            of zero bytes.  This is the default on systems that don't have the
+            function <function>posix_fallocate</function>.
+          </para>
+         </listitem>
+        </itemizedlist>
+        The <literal>write_zeros</literal> method is always used when data
+        files are extended by 8 blocks or fewer.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-max-notify-queue-pages" xreflabel="max_notify_queue_pages">
       <term><varname>max_notify_queue_pages</varname> (<type>integer</type>)
       <indexterm>
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 9670e809b72..a2fd55cc408 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -164,6 +164,9 @@ bool		data_sync_retry = false;
 /* How SyncDataDirectory() should do its job. */
 int			recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
 
+/* How data files should be bulk-extended with zeros. */
+int			file_extend_method = DEFAULT_FILE_EXTEND_METHOD;
+
 /* Which kinds of files should be opened with PG_O_DIRECT. */
 int			io_direct_flags;
 
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 71bcdeb6601..df0aa20708d 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -602,13 +602,24 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum,
 		 * that decision should be made though? For now just use a cutoff of
 		 * 8, anything between 4 and 8 worked OK in some local testing.
 		 */
-		if (numblocks > 8)
+		if (numblocks > 8 &&
+			file_extend_method != FILE_EXTEND_METHOD_WRITE_ZEROS)
 		{
-			int			ret;
+			int			ret = 0;
 
-			ret = FileFallocate(v->mdfd_vfd,
-								seekpos, (pgoff_t) BLCKSZ * numblocks,
-								WAIT_EVENT_DATA_FILE_EXTEND);
+#ifdef HAVE_POSIX_FALLOCATE
+			if (file_extend_method == FILE_EXTEND_METHOD_POSIX_FALLOCATE)
+			{
+				ret = FileFallocate(v->mdfd_vfd,
+									seekpos, (pgoff_t) BLCKSZ * numblocks,
+									WAIT_EVENT_DATA_FILE_EXTEND);
+			}
+			else
+#endif
+			{
+				elog(ERROR, "unsupported file_extend_method: %d",
+					 file_extend_method);
+			}
 			if (ret != 0)
 			{
 				ereport(ERROR,
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index 3b9d8349078..220a092ef52 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -1039,6 +1039,13 @@
   options => 'file_copy_method_options',
 },
 
+{ name => 'file_extend_method', type => 'enum', context => 'PGC_SIGHUP', group => 'RESOURCES_DISK',
+  short_desc => 'Selects the method used for extending data files.',
+  variable => 'file_extend_method',
+  boot_val => 'DEFAULT_FILE_EXTEND_METHOD',
+  options => 'file_extend_method_options',
+},
+
 { name => 'from_collapse_limit', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_OTHER',
   short_desc => 'Sets the FROM-list size beyond which subqueries are not collapsed.',
   long_desc => 'The planner will merge subqueries into upper queries if the resulting FROM list would have no more than this many items.',
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index f87b558c2c6..6c65a47a88d 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -80,6 +80,7 @@
 #include "storage/bufmgr.h"
 #include "storage/bufpage.h"
 #include "storage/copydir.h"
+#include "storage/fd.h"
 #include "storage/io_worker.h"
 #include "storage/large_object.h"
 #include "storage/pg_shmem.h"
@@ -491,6 +492,14 @@ static const struct config_enum_entry file_copy_method_options[] = {
 	{NULL, 0, false}
 };
 
+static const struct config_enum_entry file_extend_method_options[] = {
+#ifdef HAVE_POSIX_FALLOCATE
+	{"posix_fallocate", FILE_EXTEND_METHOD_POSIX_FALLOCATE, false},
+#endif
+	{"write_zeros", FILE_EXTEND_METHOD_WRITE_ZEROS, false},
+	{NULL, 0, false}
+};
+
 /*
  * Options for enum values stored in other modules
  */
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index dc9e2255f8a..753a42e8ca5 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -179,6 +179,10 @@
                                         # in kilobytes, or -1 for no limit
 
 #file_copy_method = copy                # copy, clone (if supported by OS)
+#file_extend_method = posix_fallocate   # the default is the first option supported
+                                        # by the operating system:
+                                        #   posix_fallocate (most Unix-like systems)
+                                        #   write_zeros
 
 #max_notify_queue_pages = 1048576       # limits the number of SLRU pages allocated
                                         # for NOTIFY / LISTEN queue
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index a8b0c9b3997..f21ac4545a8 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -55,12 +55,23 @@ typedef int File;
 #define IO_DIRECT_WAL			0x02
 #define IO_DIRECT_WAL_INIT		0x04
 
+enum FileExtendMethod
+{
+#ifdef HAVE_POSIX_FALLOCATE
+	FILE_EXTEND_METHOD_POSIX_FALLOCATE,
+#endif
+	FILE_EXTEND_METHOD_WRITE_ZEROS,
+};
+
+/* Default to the first available file_extend_method. */
+#define DEFAULT_FILE_EXTEND_METHOD 0
 
 /* GUC parameter */
 extern PGDLLIMPORT int max_files_per_process;
 extern PGDLLIMPORT bool data_sync_retry;
 extern PGDLLIMPORT int recovery_init_sync_method;
 extern PGDLLIMPORT int io_direct_flags;
+extern PGDLLIMPORT int file_extend_method;
 
 /*
  * This is private to fd.c, but exported for save/restore_backend_variables()
-- 
2.51.2