v1-0001-Speed-up-COPY-FROM-text-CSV-parsing-using-SIMD.patch
application/octet-stream
Filename: v1-0001-Speed-up-COPY-FROM-text-CSV-parsing-using-SIMD.patch
Type: application/octet-stream
Part: 0
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch v1-0001
Subject: Speed up COPY FROM text/CSV parsing using SIMD
| File | + | − |
|---|---|---|
| src/backend/commands/copyfromparse.c | 72 | 0 |
From 5ae3be7d262e4251bf21ac0c73b3e0ebc2ba615d Mon Sep 17 00:00:00 2001
From: Shinya Kato <shinya11.kato@gmail.com>
Date: Mon, 28 Jul 2025 22:08:20 +0900
Subject: [PATCH v1] Speed up COPY FROM text/CSV parsing using SIMD
The inner loop of CopyReadLineText scans for newlines and other special
characters by processing the input byte-by-byte. For large inputs, this
can be a performance bottleneck.
This commit introduces a SIMD-accelerated path. When not parsing inside
a quoted field, we can use vector instructions to scan the input buffer
for any character of interest in 16-byte chunks. This significantly
improves performance, especially for data with long, unquoted fields.
---
src/backend/commands/copyfromparse.c | 72 ++++++++++++++++++++++++++++
1 file changed, 72 insertions(+)
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index b1ae97b833d..5aba0fa6cb7 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -71,7 +71,9 @@
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "port/pg_bitutils.h"
#include "port/pg_bswap.h"
+#include "port/simd.h"
#include "utils/builtins.h"
#include "utils/rel.h"
@@ -1255,6 +1257,14 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
char quotec = '\0';
char escapec = '\0';
+#ifndef USE_NO_SIMD
+ Vector8 nl = vector8_broadcast('\n');
+ Vector8 cr = vector8_broadcast('\r');
+ Vector8 bs = vector8_broadcast('\\');
+ Vector8 quote;
+ Vector8 escape;
+#endif
+
if (is_csv)
{
quotec = cstate->opts.quote[0];
@@ -1262,6 +1272,12 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
/* ignore special escape processing if it's the same as quotec */
if (quotec == escapec)
escapec = '\0';
+
+#ifndef USE_NO_SIMD
+ quote = vector8_broadcast(quotec);
+ if (quotec != escapec)
+ escape = vector8_broadcast(escapec);
+#endif
}
/*
@@ -1328,6 +1344,62 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
need_data = false;
}
+#ifndef USE_NO_SIMD
+ /*
+ * SIMD instructions are used here to efficiently scan the input buffer
+ * for special characters (e.g., newline, carriage return, quotes, or
+ * escape characters). This approach significantly improves performance
+ * compared to byte-by-byte iteration, especially for large input
+ * buffers.
+ *
+ * However, SIMD optimization cannot be applied in the following cases:
+ * - Inside quoted fields, where escape sequences and closing quotes
+ * require sequential processing to handle correctly.
+ * - When the remaining buffer size is smaller than the size of a SIMD
+ * vector register, as SIMD operations require processing data in
+ * fixed-size chunks.
+ */
+ if (!in_quote && copy_buf_len - input_buf_ptr >= sizeof(Vector8))
+ {
+ Vector8 chunk;
+ Vector8 match;
+ uint32 mask;
+
+ /* Load a chunk of data into a vector register */
+ vector8_load(&chunk, (const uint8 *) ©_input_buf[input_buf_ptr]);
+
+ /* Create a mask of all special characters we need to stop at */
+ match = vector8_or(vector8_eq(chunk, nl), vector8_eq(chunk, cr));
+
+ if (is_csv)
+ {
+ match = vector8_or(match, vector8_eq(chunk, quote));
+ if (escapec != '\0')
+ match = vector8_or(match, vector8_eq(chunk, escape));
+ }
+ else
+ match = vector8_or(match, vector8_eq(chunk, bs));
+
+ /* Check if we found any special characters */
+ mask = vector8_highbit_mask(match);
+ if (mask != 0)
+ {
+ /*
+ * Found a special character. Advance up to that point and let
+ * the scalar code handle it.
+ */
+ int advance = pg_rightmost_one_pos32(mask);
+ input_buf_ptr += advance;
+ }
+ else
+ {
+ /* No special characters found, so skip the entire chunk */
+ input_buf_ptr += sizeof(Vector8);
+ continue;
+ }
+ }
+#endif
+
/* OK to fetch a character */
prev_raw_ptr = input_buf_ptr;
c = copy_input_buf[input_buf_ptr++];
--
2.47.1