From f8cf19f4764de42851f7b98ce652e8e2ece6af40 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Fri, 21 Nov 2025 12:14:21 -0800
Subject: [PATCH v9 08/11] Use multibyte-aware extraction of pattern prefixes.

Previously, like_fixed_prefix() used char-at-a-time logic, which
forced it to be too conservative for case-insensitive matching.

Now, use a pg_wchar-at-a-time loop for text types, along with proper
detection of cased characters; and preserve the char-at-a-time logic
for bytea.

Removes the pg_locale_t char_is_cased() single-byte method and
replaces it with a proper multibyte pg_iswcased() method.
---
 src/backend/utils/adt/like_support.c      | 111 +++++++++++++---------
 src/backend/utils/adt/pg_locale.c         |  26 +++--
 src/backend/utils/adt/pg_locale_builtin.c |   7 +-
 src/backend/utils/adt/pg_locale_icu.c     |  15 ++-
 src/backend/utils/adt/pg_locale_libc.c    |  23 +++--
 src/include/utils/pg_locale.h             |   5 +-
 6 files changed, 103 insertions(+), 84 deletions(-)

diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c
index 0debccfa67b..e7255fa652a 100644
--- a/src/backend/utils/adt/like_support.c
+++ b/src/backend/utils/adt/like_support.c
@@ -987,12 +987,11 @@ static Pattern_Prefix_Status
 like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 				  Const **prefix_const, Selectivity *rest_selec)
 {
-	char	   *match;
 	char	   *patt;
 	int			pattlen;
 	Oid			typeid = patt_const->consttype;
-	int			pos,
-				match_pos;
+	int			pos;
+	int			match_pos = 0;
 	pg_locale_t locale = 0;
 
 	/* the right-hand const is type text or bytea */
@@ -1020,67 +1019,91 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 		locale = pg_newlocale_from_collation(collation);
 	}
 
+	/* for text types, use pg_wchar; for BYTEA, use char */
 	if (typeid != BYTEAOID)
 	{
-		patt = TextDatumGetCString(patt_const->constvalue);
-		pattlen = strlen(patt);
+		text	   *val = DatumGetTextPP(patt_const->constvalue);
+		pg_wchar   *wpatt;
+		pg_wchar   *wmatch;
+		char	   *match;
+		int			wpattlen;
+
+		patt = VARDATA_ANY(val);
+		pattlen = VARSIZE_ANY_EXHDR(val);
+		wpatt = palloc((pattlen + 1) * sizeof(pg_wchar));
+		wmatch = palloc((pattlen + 1) * sizeof(pg_wchar));
+		wpattlen = pg_mb2wchar_with_len(patt, wpatt, pattlen);
+
+		match = palloc(pattlen + 1);
+		for (pos = 0; pos < wpattlen; pos++)
+		{
+			/* % and _ are wildcard characters in LIKE */
+			if (wpatt[pos] == '%' || wpatt[pos] == '_')
+				break;
+
+			/* Backslash escapes the next character */
+			if (wpatt[pos] == '\\')
+			{
+				pos++;
+				if (pos >= wpattlen)
+					break;
+			}
+
+			/* For ILIKE, a cased character is sort of a wildcard: stop */
+			if (case_insensitive && pg_iswcased(wpatt[pos], locale))
+				break;
+
+			wmatch[match_pos++] = wpatt[pos];
+		}
+
+		wmatch[match_pos] = '\0';
+		pg_wchar2mb_with_len(wmatch, match, match_pos);
+
+		/* pos counted characters; convert it to a byte offset into patt */
+		pos = pg_mbcharcliplen(patt, pattlen, pos);
+
+		pfree(wpatt);
+		pfree(wmatch);
+
+		*prefix_const = string_to_const(match, typeid);
+		pfree(match);
 	}
 	else
 	{
 		bytea	   *bstr = DatumGetByteaPP(patt_const->constvalue);
+		char	   *match;
 
+		patt = VARDATA_ANY(bstr);
 		pattlen = VARSIZE_ANY_EXHDR(bstr);
-		patt = (char *) palloc(pattlen);
-		memcpy(patt, VARDATA_ANY(bstr), pattlen);
-		Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
-	}
 
-	match = palloc(pattlen + 1);
-	match_pos = 0;
-	for (pos = 0; pos < pattlen; pos++)
-	{
-		/* % and _ are wildcard characters in LIKE */
-		if (patt[pos] == '%' ||
-			patt[pos] == '_')
-			break;
-
-		/* Backslash escapes the next character */
-		if (patt[pos] == '\\')
+		match = palloc(pattlen + 1);
+		for (pos = 0; pos < pattlen; pos++)
 		{
-			pos++;
-			if (pos >= pattlen)
+			/* % and _ are wildcard characters in LIKE */
+			if (patt[pos] == '%' ||
+				patt[pos] == '_')
 				break;
-		}
 
-		/*
-		 * Stop if case-varying character (it's sort of a wildcard).
-		 *
-		 * In multibyte character sets or with non-libc providers, we can't
-		 * use isalpha, and it does not seem worth trying to convert to
-		 * wchar_t or char32_t.  Instead, just pass the single byte to the
-		 * provider, which will assume any non-ASCII char is potentially
-		 * case-varying.
-		 */
-		if (case_insensitive && char_is_cased(patt[pos], locale))
-			break;
-
-		match[match_pos++] = patt[pos];
-	}
+			/* Backslash escapes the next character */
+			if (patt[pos] == '\\')
+			{
+				pos++;
+				if (pos >= pattlen)
+					break;
+			}
 
-	match[match_pos] = '\0';
+			match[match_pos++] = patt[pos];
+		}
 
-	if (typeid != BYTEAOID)
-		*prefix_const = string_to_const(match, typeid);
-	else
 		*prefix_const = string_to_bytea_const(match, match_pos);
 
+		pfree(match);
+	}
+
 	if (rest_selec != NULL)
 		*rest_selec = like_selectivity(&patt[pos], pattlen - pos,
 									   case_insensitive);
 
-	pfree(patt);
-	pfree(match);
-
 	/* in LIKE, an empty pattern is an exact match! */
 	if (pos == pattlen)
 		return Pattern_Prefix_Exact;	/* reached end of pattern, so exact */
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index b3afa6cad6c..6ec7a48f4c3 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1577,6 +1577,17 @@ pg_iswxdigit(pg_wchar wc, pg_locale_t locale)
 		return locale->ctype->wc_isxdigit(wc, locale);
 }
 
+bool
+pg_iswcased(pg_wchar wc, pg_locale_t locale)
+{
+	/* for the C locale, Cased and Alpha are equivalent */
+	if (locale->ctype == NULL)
+		return (wc <= (pg_wchar) 127 &&
+				(pg_char_properties[wc] & PG_ISALPHA));
+	else
+		return locale->ctype->wc_iscased(wc, locale);
+}
+
 pg_wchar
 pg_towupper(pg_wchar wc, pg_locale_t locale)
 {
@@ -1603,21 +1614,6 @@ pg_towlower(pg_wchar wc, pg_locale_t locale)
 		return locale->ctype->wc_tolower(wc, locale);
 }
 
-/*
- * char_is_cased()
- *
- * Fuzzy test of whether the given char is case-varying or not. The argument
- * is a single byte, so in a multibyte encoding, just assume any non-ASCII
- * char is case-varying.
- */
-bool
-char_is_cased(char ch, pg_locale_t locale)
-{
-	if (locale->ctype == NULL)
-		return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
-	return locale->ctype->char_is_cased(ch, locale);
-}
-
 /*
  * Return required encoding ID for the given locale, or -1 if any encoding is
  * valid for the locale.
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 1021e0d129b..0c2920112bb 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -186,10 +186,9 @@ wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
 }
 
 static bool
-char_is_cased_builtin(char ch, pg_locale_t locale)
+wc_iscased_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return IS_HIGHBIT_SET(ch) ||
-		(ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
+	return pg_u_prop_cased(to_char32(wc));
 }
 
 static pg_wchar
@@ -219,7 +218,7 @@ static const struct ctype_methods ctype_methods_builtin = {
 	.wc_ispunct = wc_ispunct_builtin,
 	.wc_isspace = wc_isspace_builtin,
 	.wc_isxdigit = wc_isxdigit_builtin,
-	.char_is_cased = char_is_cased_builtin,
+	.wc_iscased = wc_iscased_builtin,
 	.wc_tolower = wc_tolower_builtin,
 	.wc_toupper = wc_toupper_builtin,
 };
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index f5a0cc8fe41..18d026deda8 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -121,13 +121,6 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
 									 const char *locale,
 									 UErrorCode *pErrorCode);
 
-static bool
-char_is_cased_icu(char ch, pg_locale_t locale)
-{
-	return IS_HIGHBIT_SET(ch) ||
-		(ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
-}
-
 /*
  * XXX: many of the functions below rely on casts directly from pg_wchar to
  * UChar32, which is correct for the UTF-8 encoding, but not in general.
@@ -223,6 +216,12 @@ wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale)
 	return u_isxdigit(wc);
 }
 
+static bool
+wc_iscased_icu(pg_wchar wc, pg_locale_t locale)
+{
+	return u_hasBinaryProperty(wc, UCHAR_CASED);
+}
+
 static const struct ctype_methods ctype_methods_icu = {
 	.strlower = strlower_icu,
 	.strtitle = strtitle_icu,
@@ -238,7 +237,7 @@ static const struct ctype_methods ctype_methods_icu = {
 	.wc_ispunct = wc_ispunct_icu,
 	.wc_isspace = wc_isspace_icu,
 	.wc_isxdigit = wc_isxdigit_icu,
-	.char_is_cased = char_is_cased_icu,
+	.wc_iscased = wc_iscased_icu,
 	.wc_toupper = toupper_icu,
 	.wc_tolower = tolower_icu,
 };
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index feb63bbdad1..4c20797ad5c 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -184,6 +184,13 @@ wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
 #endif
 }
 
+static bool
+wc_iscased_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+	return isupper_l((unsigned char) wc, locale->lt) ||
+		islower_l((unsigned char) wc, locale->lt);
+}
+
 static bool
 wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
 {
@@ -249,14 +256,10 @@ wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
 }
 
 static bool
-char_is_cased_libc(char ch, pg_locale_t locale)
+wc_iscased_libc_mb(pg_wchar wc, pg_locale_t locale)
 {
-	bool		is_multibyte = pg_database_encoding_max_length() > 1;
-
-	if (is_multibyte && IS_HIGHBIT_SET(ch))
-		return true;
-	else
-		return isalpha_l((unsigned char) ch, locale->lt);
+	return iswupper_l((wint_t) wc, locale->lt) ||
+		iswlower_l((wint_t) wc, locale->lt);
 }
 
 static pg_wchar
@@ -330,7 +333,7 @@ static const struct ctype_methods ctype_methods_libc_sb = {
 	.wc_ispunct = wc_ispunct_libc_sb,
 	.wc_isspace = wc_isspace_libc_sb,
 	.wc_isxdigit = wc_isxdigit_libc_sb,
-	.char_is_cased = char_is_cased_libc,
+	.wc_iscased = wc_iscased_libc_sb,
 	.wc_toupper = toupper_libc_sb,
 	.wc_tolower = tolower_libc_sb,
 };
@@ -354,7 +357,7 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
 	.wc_ispunct = wc_ispunct_libc_sb,
 	.wc_isspace = wc_isspace_libc_sb,
 	.wc_isxdigit = wc_isxdigit_libc_sb,
-	.char_is_cased = char_is_cased_libc,
+	.wc_iscased = wc_iscased_libc_sb,
 	.wc_toupper = toupper_libc_sb,
 	.wc_tolower = tolower_libc_sb,
 };
@@ -374,7 +377,7 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
 	.wc_ispunct = wc_ispunct_libc_mb,
 	.wc_isspace = wc_isspace_libc_mb,
 	.wc_isxdigit = wc_isxdigit_libc_mb,
-	.char_is_cased = char_is_cased_libc,
+	.wc_iscased = wc_iscased_libc_mb,
 	.wc_toupper = toupper_libc_mb,
 	.wc_tolower = tolower_libc_mb,
 };
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index e5aaf6422e8..6dda56d1c3c 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -119,11 +119,9 @@ struct ctype_methods
 	bool		(*wc_ispunct) (pg_wchar wc, pg_locale_t locale);
 	bool		(*wc_isspace) (pg_wchar wc, pg_locale_t locale);
 	bool		(*wc_isxdigit) (pg_wchar wc, pg_locale_t locale);
+	bool		(*wc_iscased) (pg_wchar wc, pg_locale_t locale);
 	pg_wchar	(*wc_toupper) (pg_wchar wc, pg_locale_t locale);
 	pg_wchar	(*wc_tolower) (pg_wchar wc, pg_locale_t locale);
-
-	/* required */
-	bool		(*char_is_cased) (char ch, pg_locale_t locale);
 };
 
 /*
@@ -211,6 +209,7 @@ extern bool pg_iswprint(pg_wchar wc, pg_locale_t locale);
 extern bool pg_iswpunct(pg_wchar wc, pg_locale_t locale);
 extern bool pg_iswspace(pg_wchar wc, pg_locale_t locale);
 extern bool pg_iswxdigit(pg_wchar wc, pg_locale_t locale);
+extern bool pg_iswcased(pg_wchar wc, pg_locale_t locale);
 extern pg_wchar pg_towupper(pg_wchar wc, pg_locale_t locale);
 extern pg_wchar pg_towlower(pg_wchar wc, pg_locale_t locale);
 
-- 
2.43.0