v2-0001-GB18030-Switch-to-using-gb-18030-2000.ucm.patch
text/plain
Filename: v2-0001-GB18030-Switch-to-using-gb-18030-2000.ucm.patch
Type: text/plain
Part: 1
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch v2-0001
Subject: GB18030: Switch to using gb-18030-2000.ucm
| File | + | − |
|---|---|---|
| src/backend/utils/mb/Unicode/Makefile | 4 | 1 |
| src/backend/utils/mb/Unicode/UCS_to_GB18030.pl | 36 | 10 |
From 075a4d4e56ba9d79f36bb8305edb912ab1d6bc87 Mon Sep 17 00:00:00 2001
From: "Chao Li (Evan)" <lic@highgo.com>
Date: Mon, 11 Aug 2025 18:06:07 +0800
Subject: [PATCH v2] GB18030: Switch to using gb-18030-2000.ucm
This is the first in a series of patches to upgrade GB18030 encoding
from the 2000 standard to the 2022 standard.
In this patch, UCS_to_GB18030.pl is modified to use gb-18030-2000.ucm.
We do not check the UCM file into the source tree; to build the map
files, run:
make gb18030_to_utf8.map
from src/backend/utils/mb/Unicode.
Note that the gb-18030-2000.ucm used here is not the latest version
from GitHub: the Makefile pins the download to icu-data commit
d9d3a6ed27bb98a7106763e940258f0be8cd995b. A newer version exists that
fixes a mapping error. To keep this patch focused, we use the pinned
version, which matches the current gb-18030-2000.xml, so the generated
mapping files remain unchanged in this commit.
The next patch will update gb-18030-2022.ucm to the latest version.
Author: Chao Li <lic@highgo.com>
Discussion: https://www.postgresql.org/message-id/flat/966d9fc.169.198741fe60b.Coremail.jiaoshuntian@highgo.com
---
src/backend/utils/mb/Unicode/Makefile | 5 +-
.../utils/mb/Unicode/UCS_to_GB18030.pl | 46 +++++++++++++++----
2 files changed, 40 insertions(+), 11 deletions(-)
diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile
index ad789b31e54..27424b2a001 100644
--- a/src/backend/utils/mb/Unicode/Makefile
+++ b/src/backend/utils/mb/Unicode/Makefile
@@ -54,7 +54,7 @@ $(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb-18030-2000.xml))
$(eval $(call map_rule,euc_kr,UCS_to_EUC_KR.pl,KSX1001.TXT))
$(eval $(call map_rule,euc_tw,UCS_to_EUC_TW.pl,CNS11643.TXT))
$(eval $(call map_rule,sjis,UCS_to_SJIS.pl,CP932.TXT))
-$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb-18030-2000.xml))
+$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb-18030-2000.ucm))
$(eval $(call map_rule,big5,UCS_to_BIG5.pl,CP950.TXT BIG5.TXT CP950.TXT))
$(eval $(call map_rule,euc_jis_2004,UCS_to_EUC_JIS_2004.pl,euc-jis-2004-std.txt))
$(eval $(call map_rule,shift_jis_2004,UCS_to_SHIFT_JIS_2004.pl,sjis-0213-2004-std.txt))
@@ -78,6 +78,9 @@ euc-jis-2004-std.txt sjis-0213-2004-std.txt:
gb-18030-2000.xml windows-949-2000.xml:
$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F)
+gb-18030-2000.ucm:
+ $(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/d9d3a6ed27bb98a7106763e940258f0be8cd995b/charset/data/ucm/$(@F)
+
GB2312.TXT:
$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'
diff --git a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
index ddcbd6ef0c4..658e0d59e2c 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
@@ -4,14 +4,17 @@
#
# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
#
+
# Generate UTF-8 <--> GB18030 code conversion tables from
-# "gb-18030-2000.xml", obtained from
-# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
+# "gb-18030-2000.ucm", a Unicode Character Mapping file (UCM) from ICU,
+# obtained from https://github.com/unicode-org/icu-data/blob/d9d3a6ed27bb98a7106763e940258f0be8cd995b/charset/data/ucm/gb-18030-2000.ucm
#
# The lines we care about in the source file look like
-# <a u="009A" b="81 30 83 36"/>
-# where the "u" field is the Unicode code point in hex,
-# and the "b" field is the hex byte sequence for GB18030
+# <UXXXX> \xYY[\xYY...] |n
+# where <UXXXX> is the Unicode code point in hex,
+# \xYY... is the GB18030 byte sequence in hex (one \xYY per byte),
+# and n is a flag indicating the type of mapping (0 = round-trip).
+#
use strict;
use warnings FATAL => 'all';
@@ -22,19 +25,42 @@ my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_GB18030.pl';
# Read the input
-my $in_file = "gb-18030-2000.xml";
+my $in_file = "gb-18030-2000.ucm";
open(my $in, '<', $in_file) || die("cannot open $in_file");
my @mapping;
+my $in_charmap = 0;
while (<$in>)
{
- next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
- my ($u, $c) = ($1, $2);
- $c =~ s/ //g;
+ chomp;
+ # Enter CHARMAP section
+ if (/^CHARMAP/) {
+ $in_charmap = 1;
+ next;
+ }
+ # Exit CHARMAP section
+ if (/^END CHARMAP/) {
+ $in_charmap = 0;
+ last;
+ }
+ next unless $in_charmap;
+ # Skip comments and empty lines
+ next if /^#/ || /^$/;
+
+ # Match lines like: <UXXXX> \xYY[\xYY...] |n
+ next if !/^<U([0-9A-Fa-f]+)>\s+((?:\\x[0-9A-Fa-f]{2})+)\s*\|(\d+)/;
+ my ($u, $c, $flag) = ($1, $2, $3);
+
+ # Flag 0 marks a round-trip mapping; those are the only entries we keep.
+ next if ($flag ne '0');
+
my $ucs = hex($u);
- my $code = hex($c);
+ # Remove \x and concatenate bytes
+ my $c_hex = $c;
+ $c_hex =~ s/\\x//g;
+ my $code = hex($c_hex);
if ($code >= 0x80 && $ucs >= 0x0080)
{
push @mapping,
--
2.39.5 (Apple Git-154)