v5-0002-JCN-changes.patch

application/octet-stream

Filename: v5-0002-JCN-changes.patch
Type: application/octet-stream
Part: 0
Message: Re: GB18030-2022 Support in PostgreSQL

Patch

Same data as JSON: `GET /api/v1/attachments/:id/patch` returns the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v5-0002
Subject: JCN changes
File | + | −
src/backend/utils/mb/Unicode/UCS_to_GB18030.pl | +11 | −27
From 0a2ab84b481acc81b974e6049c03fe3dea56e728 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Mon, 8 Sep 2025 15:29:01 +0700
Subject: [PATCH v5 2/3] JCN changes

---
 .../utils/mb/Unicode/UCS_to_GB18030.pl        | 38 ++++++-------------
 1 file changed, 11 insertions(+), 27 deletions(-)

diff --git a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
index 658e0d59e2c..084fdf66af1 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
@@ -4,17 +4,15 @@
 #
 # src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
 #
-
 # Generate UTF-8 <--> GB18030 code conversion tables from
-# "gb-18030-2000.ucm", a Unicode Character Mapping file (UCM) from ICU,
-# obtained from https://github.com/unicode-org/icu-data/blob/d9d3a6ed27bb98a7106763e940258f0be8cd995b/charset/data/ucm/gb-18030-2000.ucm
+# "gb-18030-2000.ucm", obtained from
+# https://github.com/unicode-org/icu-data/tree/main/charset/data/ucm
 #
 # The lines we care about in the source file look like
 #   <UXXXX> \xYY[\xYY...] |n
-# where <UXXXX> is the Unicode code point in hex,
+# where XXXX is the Unicode code point in hex,
 # and the \xYY... is the hex byte sequence for GB18030,
 # and n is a flag indicating the type of mapping.
-#
 
 use strict;
 use warnings FATAL => 'all';
@@ -30,37 +28,23 @@ my $in_file = "gb-18030-2000.ucm";
 open(my $in, '<', $in_file) || die("cannot open $in_file");
 
 my @mapping;
-my $in_charmap = 0;
 
 while (<$in>)
 {
-	chomp;
-	# Enter CHARMAP section
-	if (/^CHARMAP/) {
-		$in_charmap = 1;
-		next;
-	}
-	# Exit CHARMAP section
-	if (/^END CHARMAP/) {
-		$in_charmap = 0;
-		last;
-	}
-	next unless $in_charmap;
-	# Skip comments and empty lines
-	next if /^#/ || /^$/;
+	# Mappings may have been removed by commenting out
+	next if /^#/;
 
-	# Match lines like: <UXXXX> \xYY[\xYY...] |n
-	next if !/^<U([0-9A-Fa-f]+)>\s+((?:\\x[0-9A-Fa-f]{2})+)\s*\|(\d+)/;
+	next if !/^<U([0-9A-Fa-f]+)>\s+
+			((?:\\x[0-9A-Fa-f]{2})+)\s+
+			\|(\d+)/x;
 	my ($u, $c, $flag) = ($1, $2, $3);
+	$c =~ s/\\x//g;
 
-	# flag 0 means round-trip mapping, we only care about that
+	# We only want round-trip mappings
 	next if ($flag ne '0');
 
 	my $ucs = hex($u);
-	# Remove \x and concatenate bytes
-	my $c_hex = $c;
-	$c_hex =~ s/\\x//g;
-	my $code = hex($c_hex);
+	my $code = hex($c);
 	if ($code >= 0x80 && $ucs >= 0x0080)
 	{
 		push @mapping,
-- 
2.39.5 (Apple Git-154)