From de1d8c438c74cbb0b8bba70172f02e746db21a05 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Mon, 20 Oct 2025 16:32:18 -0700
Subject: [PATCH v11 8/9] downcase_identifier(): use method table from locale
 provider.

Previously, libc's tolower() was always used for identifier case
folding, regardless of the database locale (though only characters
beyond 127 in single-byte encodings were affected). Refactor to allow
each locale provider to supply its own implementation of identifier
case folding.

For historical compatibility, when using a single-byte encoding, the
ICU provider still relies on libc's tolower(), and therefore on the
global LC_CTYPE setting.

One minor behavior change is that, before the database default locale
is initialized, downcase_identifier() folds identifiers using ASCII
semantics. Previously, it would use the postmaster's LC_CTYPE
setting from the environment. While that could have some effect during
GUC processing, for example, it would have been fragile to rely on the
environment setting anyway. (Also, it only matters when the encoding
is single-byte.)

Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d.camel@j-davis.com
---
 src/backend/parser/scansup.c              | 39 +++++++---------
 src/backend/utils/adt/pg_locale.c         | 32 +++++++++++++
 src/backend/utils/adt/pg_locale_builtin.c | 24 ++++++++++
 src/backend/utils/adt/pg_locale_icu.c     | 36 ++++++++++++++-
 src/backend/utils/adt/pg_locale_libc.c    | 55 +++++++++++++++++++++++
 src/include/utils/pg_locale.h             |  5 +++
 6 files changed, 166 insertions(+), 25 deletions(-)

diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index 2feb2b6cf5a..0bd049643d1 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -18,6 +18,7 @@
 
 #include "mb/pg_wchar.h"
 #include "parser/scansup.h"
+#include "utils/pg_locale.h"
 
 
 /*
@@ -46,35 +47,25 @@ char *
 downcase_identifier(const char *ident, int len, bool warn, bool truncate)
 {
 	char	   *result;
-	int			i;
-	bool		enc_is_single_byte;
-
-	result = palloc(len + 1);
-	enc_is_single_byte = pg_database_encoding_max_length() == 1;
+	size_t		dstsize;
+	size_t		needed pg_attribute_unused();
 
 	/*
-	 * SQL99 specifies Unicode-aware case normalization, which we don't yet
-	 * have the infrastructure for.  Instead we use tolower() to provide a
-	 * locale-aware translation.  However, there are some locales where this
-	 * is not right either (eg, Turkish may do strange things with 'i' and
-	 * 'I').  Our current compromise is to use tolower() for characters with
-	 * the high bit set, as long as they aren't part of a multi-byte
-	 * character, and use an ASCII-only downcasing for 7-bit characters.
+	 * All current providers fold byte-for-byte, so the result is len bytes.
+	 *
+	 * NB: if we decide to support Unicode-aware identifier case folding, then
+	 * we need to account for a change in string length.
 	 */
-	for (i = 0; i < len; i++)
-	{
-		unsigned char ch = (unsigned char) ident[i];
+	dstsize = len + 1;
+	result = palloc(dstsize);
 
-		if (ch >= 'A' && ch <= 'Z')
-			ch += 'a' - 'A';
-		else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
-			ch = tolower(ch);
-		result[i] = (char) ch;
-	}
-	result[i] = '\0';
+	needed = pg_strfold_ident(result, dstsize, ident, len);
+	Assert(needed + 1 == dstsize);
+	Assert(needed == len);
+	Assert(result[len] == '\0');
 
-	if (i >= NAMEDATALEN && truncate)
-		truncate_identifier(result, i, warn);
+	if (len >= NAMEDATALEN && truncate)
+		truncate_identifier(result, len, warn);
 
 	return result;
 }
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index c4e89502f85..9167018c85b 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1352,6 +1352,38 @@ pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 		return locale->ctype->strfold(dst, dstsize, src, srclen, locale);
 }
 
+/*
+ * Fold an identifier using the database default locale; if that locale (or
+ * its ctype methods) is not set, fold only A..Z. The fallback returns srclen
+ * and NUL-terminates dest only if there is room. For historical reasons, does
+ * not use ordinary locale behavior; use only for identifier folding.
+ * XXX: can we make this equivalent to pg_strfold(..., default_locale)?
+ */
+size_t
+pg_strfold_ident(char *dest, size_t destsize, const char *src, ssize_t srclen)
+{
+	if (default_locale == NULL || default_locale->ctype == NULL)
+	{
+		int			i;
+
+		for (i = 0; i < srclen && i < destsize; i++)
+		{
+			unsigned char ch = (unsigned char) src[i];
+
+			if (ch >= 'A' && ch <= 'Z')
+				ch += 'a' - 'A';
+			dest[i] = (char) ch;
+		}
+
+		if (i < destsize)
+			dest[i] = '\0';
+
+		return srclen;
+	}
+	return default_locale->ctype->strfold_ident(dest, destsize, src, srclen,
+												default_locale);
+}
+
 /*
  * pg_strcoll
  *
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 0c2920112bb..659e588d513 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -125,6 +125,29 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 						   locale->builtin.casemap_full);
 }
 
+static size_t
+strfold_ident_builtin(char *dst, size_t dstsize, const char *src,
+					  ssize_t srclen, pg_locale_t locale)
+{
+	int			i;
+
+	Assert(GetDatabaseEncoding() == PG_UTF8);
+	/* ASCII-only: fold A..Z, copy every other byte unchanged */
+	for (i = 0; i < srclen && i < dstsize; i++)
+	{
+		unsigned char ch = (unsigned char) src[i];
+
+		if (ch >= 'A' && ch <= 'Z')
+			ch += 'a' - 'A';
+		dst[i] = (char) ch;
+	}
+	/* terminate only if the output was not truncated */
+	if (i < dstsize)
+		dst[i] = '\0';
+
+	return srclen;
+}
+
 static bool
 wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale)
 {
@@ -208,6 +231,7 @@ static const struct ctype_methods ctype_methods_builtin = {
 	.strtitle = strtitle_builtin,
 	.strupper = strupper_builtin,
 	.strfold = strfold_builtin,
+	.strfold_ident = strfold_ident_builtin,
 	.wc_isdigit = wc_isdigit_builtin,
 	.wc_isalpha = wc_isalpha_builtin,
 	.wc_isalnum = wc_isalnum_builtin,
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index 18d026deda8..39b153a4262 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -61,6 +61,8 @@ static size_t strupper_icu(char *dest, size_t destsize, const char *src,
 						   ssize_t srclen, pg_locale_t locale);
 static size_t strfold_icu(char *dest, size_t destsize, const char *src,
 						  ssize_t srclen, pg_locale_t locale);
+static size_t strfold_ident_icu(char *dst, size_t dstsize, const char *src,
+								ssize_t srclen, pg_locale_t locale);
 static int	strncoll_icu(const char *arg1, ssize_t len1,
 						 const char *arg2, ssize_t len2,
 						 pg_locale_t locale);
@@ -123,7 +125,7 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
 
 /*
  * XXX: many of the functions below rely on casts directly from pg_wchar to
- * UChar32, which is correct for the UTF-8 encoding, but not in general.
+ * UChar32, which is correct for UTF-8 and LATIN1, but not in general.
  */
 
 static pg_wchar
@@ -227,6 +229,7 @@ static const struct ctype_methods ctype_methods_icu = {
 	.strtitle = strtitle_icu,
 	.strupper = strupper_icu,
 	.strfold = strfold_icu,
+	.strfold_ident = strfold_ident_icu,
 	.wc_isdigit = wc_isdigit_icu,
 	.wc_isalpha = wc_isalpha_icu,
 	.wc_isalnum = wc_isalnum_icu,
@@ -564,6 +567,37 @@ strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
 	return result_len;
 }
 
+/*
+ * Fold an identifier for the ICU provider without calling ICU: A..Z always
+ * fold to a..z. For historical compatibility, not multibyte-aware; bytes
+ * beyond 127 are folded only in single-byte encodings, using libc tolower(),
+ * which therefore relies on the global LC_CTYPE setting.
+ */
+static size_t
+strfold_ident_icu(char *dst, size_t dstsize, const char *src,
+				  ssize_t srclen, pg_locale_t locale)
+{
+	int			i;
+	bool		enc_is_single_byte;
+
+	enc_is_single_byte = pg_database_encoding_max_length() == 1;
+	for (i = 0; i < srclen && i < dstsize; i++)
+	{
+		unsigned char ch = (unsigned char) src[i];
+
+		if (ch >= 'A' && ch <= 'Z')
+			ch += 'a' - 'A';
+		else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
+			ch = tolower(ch);
+		dst[i] = (char) ch;
+	}
+
+	if (i < dstsize)
+		dst[i] = '\0';
+
+	return srclen;
+}
+
 /*
  * strncoll_icu_utf8
  *
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 4cb3c64b4a6..85c7885a8ae 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -318,12 +318,65 @@ tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
 		return wc;
 }
 
+/*
+ * Characters A..Z always fold to a..z, even in the Turkish locale. Characters
+ * beyond 127 use isupper_l()/tolower_l() with the locale's locale_t.
+ */
+static size_t
+strfold_ident_libc_sb(char *dst, size_t dstsize, const char *src,
+					  ssize_t srclen, pg_locale_t locale)
+{
+	locale_t	loc = locale->lt;
+	int			i;
+
+	for (i = 0; i < srclen && i < dstsize; i++)
+	{
+		unsigned char ch = (unsigned char) src[i];
+
+		if (ch >= 'A' && ch <= 'Z')
+			ch += 'a' - 'A';
+		else if (IS_HIGHBIT_SET(ch) && isupper_l(ch, loc))
+			ch = tolower_l(ch, loc);
+		dst[i] = (char) ch;
+	}
+
+	if (i < dstsize)
+		dst[i] = '\0';
+
+	return srclen;
+}
+
+/*
+ * For historical reasons, not multibyte-aware; folds only ASCII A..Z.
+ */
+static size_t
+strfold_ident_libc_mb(char *dst, size_t dstsize, const char *src,
+					  ssize_t srclen, pg_locale_t locale)
+{
+	int			i;
+
+	for (i = 0; i < srclen && i < dstsize; i++)
+	{
+		unsigned char ch = (unsigned char) src[i];
+
+		if (ch >= 'A' && ch <= 'Z')
+			ch += 'a' - 'A';
+		dst[i] = (char) ch;
+	}
+	/* terminate only if the output was not truncated */
+	if (i < dstsize)
+		dst[i] = '\0';
+
+	return srclen;
+}
+
 static const struct ctype_methods ctype_methods_libc_sb = {
 	.strlower = strlower_libc_sb,
 	.strtitle = strtitle_libc_sb,
 	.strupper = strupper_libc_sb,
 	/* in libc, casefolding is the same as lowercasing */
 	.strfold = strlower_libc_sb,
+	.strfold_ident = strfold_ident_libc_sb,
 	.wc_isdigit = wc_isdigit_libc_sb,
 	.wc_isalpha = wc_isalpha_libc_sb,
 	.wc_isalnum = wc_isalnum_libc_sb,
@@ -349,6 +402,7 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
 	.strupper = strupper_libc_mb,
 	/* in libc, casefolding is the same as lowercasing */
 	.strfold = strlower_libc_mb,
+	.strfold_ident = strfold_ident_libc_mb,
 	.wc_isdigit = wc_isdigit_libc_sb,
 	.wc_isalpha = wc_isalpha_libc_sb,
 	.wc_isalnum = wc_isalnum_libc_sb,
@@ -370,6 +424,7 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
 	.strupper = strupper_libc_mb,
 	/* in libc, casefolding is the same as lowercasing */
 	.strfold = strlower_libc_mb,
+	.strfold_ident = strfold_ident_libc_mb,
 	.wc_isdigit = wc_isdigit_libc_mb,
 	.wc_isalpha = wc_isalpha_libc_mb,
 	.wc_isalnum = wc_isalnum_libc_mb,
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 01f891def7a..53574d2ef85 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -110,6 +110,9 @@ struct ctype_methods
 	size_t		(*strfold) (char *dest, size_t destsize,
 							const char *src, ssize_t srclen,
 							pg_locale_t locale);
+	size_t		(*strfold_ident) (char *dest, size_t destsize,
+								  const char *src, ssize_t srclen,
+								  pg_locale_t locale);
 
 	/* required */
 	bool		(*wc_isdigit) (pg_wchar wc, pg_locale_t locale);
@@ -188,6 +191,8 @@ extern size_t pg_strupper(char *dst, size_t dstsize,
 extern size_t pg_strfold(char *dst, size_t dstsize,
 						 const char *src, ssize_t srclen,
 						 pg_locale_t locale);
+extern size_t pg_strfold_ident(char *dst, size_t dstsize,
+							   const char *src, ssize_t srclen);
 extern int	pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
 extern int	pg_strncoll(const char *arg1, ssize_t len1,
 						const char *arg2, ssize_t len2, pg_locale_t locale);
-- 
2.43.0

