v1-0001-Speed-up-COPY-FROM-text-CSV-parsing-using-SIMD.patch

application/octet-stream

Filename: v1-0001-Speed-up-COPY-FROM-text-CSV-parsing-using-SIMD.patch
Type: application/octet-stream
Part: 0
Message: Speed up COPY FROM text/CSV parsing using SIMD

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch returns the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v1-0001
Subject: Speed up COPY FROM text/CSV parsing using SIMD
File | + | -
src/backend/commands/copyfromparse.c | 72 | 0
From 5ae3be7d262e4251bf21ac0c73b3e0ebc2ba615d Mon Sep 17 00:00:00 2001
From: Shinya Kato <shinya11.kato@gmail.com>
Date: Mon, 28 Jul 2025 22:08:20 +0900
Subject: [PATCH v1] Speed up COPY FROM text/CSV parsing using SIMD

The inner loop of CopyReadLineText scans for newlines and other special
characters by processing the input byte-by-byte. For large inputs, this
can be a performance bottleneck.

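This commit introduces a SIMD-accelerated path. When not parsing inside
a quoted field, and at least one full vector of input remains in the
buffer, vector instructions scan the next 16-byte chunk for any
character of interest: newline, carriage return, and either backslash
(text format) or the quote and escape characters (CSV format). A chunk
containing none of them is skipped in one step; otherwise the scan
advances to the first match and the existing byte-by-byte code handles
it. This significantly improves performance, especially for data with
long, unquoted fields.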
---
 src/backend/commands/copyfromparse.c | 72 ++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index b1ae97b833d..5aba0fa6cb7 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -71,7 +71,9 @@
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "port/pg_bitutils.h"
 #include "port/pg_bswap.h"
+#include "port/simd.h"
 #include "utils/builtins.h"
 #include "utils/rel.h"
 
@@ -1255,6 +1257,14 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 	char		quotec = '\0';
 	char		escapec = '\0';
 
+#ifndef USE_NO_SIMD
+	Vector8		nl = vector8_broadcast('\n');
+	Vector8		cr = vector8_broadcast('\r');
+	Vector8		bs = vector8_broadcast('\\');
+	Vector8		quote;
+	Vector8		escape;
+#endif
+
 	if (is_csv)
 	{
 		quotec = cstate->opts.quote[0];
@@ -1262,6 +1272,12 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		/* ignore special escape processing if it's the same as quotec */
 		if (quotec == escapec)
 			escapec = '\0';
+
+#ifndef USE_NO_SIMD
+		quote = vector8_broadcast(quotec);
+		if (quotec != escapec)
+			escape = vector8_broadcast(escapec);
+#endif
 	}
 
 	/*
@@ -1328,6 +1344,62 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 			need_data = false;
 		}
 
+#ifndef USE_NO_SIMD
+		/*
+		 * SIMD instructions are used here to efficiently scan the input buffer
+		 * for special characters (e.g., newline, carriage return, quotes, or
+		 * escape characters). This approach significantly improves performance
+		 * compared to byte-by-byte iteration, especially for large input
+		 * buffers.
+		 *
+		 * However, SIMD optimization cannot be applied in the following cases:
+		 * - Inside quoted fields, where escape sequences and closing quotes
+		 *   require sequential processing to handle correctly.
+		 * - When the remaining buffer size is smaller than the size of a SIMD
+		 *   vector register, as SIMD operations require processing data in
+		 *   fixed-size chunks.
+		 */
+		if (!in_quote && copy_buf_len - input_buf_ptr >= sizeof(Vector8))
+		{
+			Vector8		chunk;
+			Vector8		match;
+			uint32		mask;
+
+			/* Load a chunk of data into a vector register */
+			vector8_load(&chunk, (const uint8 *) &copy_input_buf[input_buf_ptr]);
+
+			/* Create a mask of all special characters we need to stop at */
+			match = vector8_or(vector8_eq(chunk, nl), vector8_eq(chunk, cr));
+
+			if (is_csv)
+			{
+				match = vector8_or(match, vector8_eq(chunk, quote));
+				if (escapec != '\0')
+					match = vector8_or(match, vector8_eq(chunk, escape));
+			}
+			else
+				match = vector8_or(match, vector8_eq(chunk, bs));
+
+			/* Check if we found any special characters */
+			mask = vector8_highbit_mask(match);
+			if (mask != 0)
+			{
+				/*
+				 * Found a special character. Advance up to that point and let
+				 * the scalar code handle it.
+				 */
+				int advance = pg_rightmost_one_pos32(mask);
+				input_buf_ptr += advance;
+			}
+			else
+			{
+				/* No special characters found, so skip the entire chunk */
+				input_buf_ptr += sizeof(Vector8);
+				continue;
+			}
+		}
+#endif
+
 		/* OK to fetch a character */
 		prev_raw_ptr = input_buf_ptr;
 		c = copy_input_buf[input_buf_ptr++];
-- 
2.47.1