v1-0001-Generate-EUC_CN-and-UHC-mappings-from-the-Unicode.patch
application/octet-stream
Filename: v1-0001-Generate-EUC_CN-and-UHC-mappings-from-the-Unicode.patch
Type: application/octet-stream
Part: 0
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch v1-0001
Subject: Generate EUC_CN and UHC mappings from the Unicode Consortium's UCM file
| File | + | − |
|---|---|---|
| src/backend/utils/mb/Unicode/Makefile | 4 | 4 |
| src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl | 21 | 11 |
| src/backend/utils/mb/Unicode/UCS_to_UHC.pl | 20 | 10 |
From 24a2f4514854860408dcfad4208658ae4c63d6f7 Mon Sep 17 00:00:00 2001
From: "Chao Li (Evan)" <lic@highgo.com>
Date: Wed, 24 Sep 2025 17:03:13 +0800
Subject: [PATCH v1] Generate EUC_CN and UHC mappings from the Unicode
Consortium's UCM file
This is a follow-up to commit cfa6cd2, so that we can delete the XML
file from our repository.
Author: Chao Li <lic@highgo.com>
Discussion: https://postgr.es/m/966d9fc.169.198741fe60b.Coremail.jiaoshuntian%40highgo.com
---
src/backend/utils/mb/Unicode/Makefile | 8 ++---
src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl | 32 ++++++++++++-------
src/backend/utils/mb/Unicode/UCS_to_UHC.pl | 30 +++++++++++------
3 files changed, 45 insertions(+), 25 deletions(-)
diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile
index 9f6cdcc96de..5b8a6911032 100644
--- a/src/backend/utils/mb/Unicode/Makefile
+++ b/src/backend/utils/mb/Unicode/Makefile
@@ -48,9 +48,9 @@ $(eval $(call map_rule,koi8u,UCS_to_most.pl,KOI8-U.TXT,KOI8U))
$(eval $(call map_rule,gbk,UCS_to_most.pl,CP936.TXT,GBK))
$(eval $(call map_rule,johab,UCS_to_JOHAB.pl,JOHAB.TXT))
-$(eval $(call map_rule,uhc,UCS_to_UHC.pl,windows-949-2000.xml))
+$(eval $(call map_rule,uhc,UCS_to_UHC.pl,windows-949-2000.ucm))
$(eval $(call map_rule,euc_jp,UCS_to_EUC_JP.pl,CP932.TXT JIS0212.TXT))
-$(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb-18030-2000.xml))
+$(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb-18030-2000.ucm))
$(eval $(call map_rule,euc_kr,UCS_to_EUC_KR.pl,KSX1001.TXT))
$(eval $(call map_rule,euc_tw,UCS_to_EUC_TW.pl,CNS11643.TXT))
$(eval $(call map_rule,sjis,UCS_to_SJIS.pl,CP932.TXT))
@@ -75,8 +75,8 @@ BIG5.TXT CNS11643.TXT:
euc-jis-2004-std.txt sjis-0213-2004-std.txt:
$(DOWNLOAD) http://x0213.org/codetable/$(@F)
-gb-18030-2000.xml windows-949-2000.xml:
- $(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F)
+gb-18030-2000.ucm windows-949-2000.ucm:
+ $(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/d9d3a6ed27bb98a7106763e940258f0be8cd995b/charset/data/ucm/$(@F)
gb18030-2022.ucm:
$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu/refs/heads/main/icu4c/source/data/mappings/$(@F)
diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
index f7776631e4c..d5204e57cbd 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
@@ -2,16 +2,17 @@
#
# Copyright (c) 2007-2025, PostgreSQL Global Development Group
#
-# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
+# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
#
-# Generate UTF-8 <--> GB18030 code conversion tables from
-# "gb-18030-2000.xml", obtained from
-# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
+# Generate UTF-8 <--> EUC_CN code conversion tables from
+# "gb-18030-2000.ucm", obtained from
+# https://github.com/unicode-org/icu-data/tree/main/charset/data/ucm
#
# The lines we care about in the source file look like
-# <a u="009A" b="81 30 83 36"/>
-# where the "u" field is the Unicode code point in hex,
-# and the "b" field is the hex byte sequence for GB18030
+# <UXXXX> \xYY[\xYY...] |n
+# where XXXX is the Unicode code point in hex,
+# and the \xYY... is the hex byte sequence for GB18030,
+# and n is a flag indicating the type of mapping (0 = round-trip).
use strict;
use warnings FATAL => 'all';
@@ -22,7 +23,7 @@ my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl';
# Read the input
-my $in_file = "gb-18030-2000.xml";
+my $in_file = "gb-18030-2000.ucm";
open(my $in, '<', $in_file) || die("cannot open $in_file");
@@ -30,9 +31,18 @@ my @mapping;
while (<$in>)
{
- next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
- my ($u, $c) = ($1, $2);
- $c =~ s/ //g;
+ # Skip comment lines; mappings may have been removed by commenting them out
+ next if /^#/;
+
+ next if !/^<U([0-9A-Fa-f]+)>\s+
+ ((?:\\x[0-9A-Fa-f]{2})+)\s+
+ \|(\d+)/x;
+ my ($u, $c, $flag) = ($1, $2, $3);
+ $c =~ s/\\x//g;
+
+ # We only want round-trip mappings
+ next if ($flag ne '0');
+
my $ucs = hex($u);
my $code = hex($c);
diff --git a/src/backend/utils/mb/Unicode/UCS_to_UHC.pl b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl
index c6087b5c382..e666c1839cf 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_UHC.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl
@@ -2,16 +2,17 @@
#
# Copyright (c) 2007-2025, PostgreSQL Global Development Group
#
-# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
+# src/backend/utils/mb/Unicode/UCS_to_UHC.pl
#
# Generate UTF-8 <--> UHC code conversion tables from
-# "windows-949-2000.xml", obtained from
-# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
+# "windows-949-2000.ucm", obtained from
+# https://github.com/unicode-org/icu-data/tree/main/charset/data/ucm
#
# The lines we care about in the source file look like
-# <a u="009A" b="81 30 83 36"/>
-# where the "u" field is the Unicode code point in hex,
-# and the "b" field is the hex byte sequence for UHC
+# <UXXXX> \xYY[\xYY...] |n
+# where XXXX is the Unicode code point in hex,
+# and the \xYY... is the hex byte sequence for UHC,
+# and n is a flag indicating the type of mapping (0 = round-trip).
use strict;
use warnings FATAL => 'all';
@@ -22,7 +23,7 @@ my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_UHC.pl';
# Read the input
-my $in_file = "windows-949-2000.xml";
+my $in_file = "windows-949-2000.ucm";
open(my $in, '<', $in_file) || die("cannot open $in_file");
@@ -30,9 +31,18 @@ my @mapping;
while (<$in>)
{
- next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
- my ($u, $c) = ($1, $2);
- $c =~ s/ //g;
+ # Skip comment lines; mappings may have been removed by commenting them out
+ next if /^#/;
+
+ next if !/^<U([0-9A-Fa-f]+)>\s+
+ ((?:\\x[0-9A-Fa-f]{2})+)\s+
+ \|(\d+)/x;
+ my ($u, $c, $flag) = ($1, $2, $3);
+ $c =~ s/\\x//g;
+
+ # We only want round-trip mappings
+ next if ($flag ne '0');
+
my $ucs = hex($u);
my $code = hex($c);
--
2.39.5 (Apple Git-154)