v10-0008-Use-multibyte-aware-extraction-of-pattern-prefix.patch
text/x-patch
Filename: v10-0008-Use-multibyte-aware-extraction-of-pattern-prefix.patch
Type: text/x-patch
Part: 7
From 94a0e519f13cfc8554d11cf46ed7bbef8aad2ed3 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Fri, 21 Nov 2025 12:14:21 -0800
Subject: [PATCH v10 08/11] Use multibyte-aware extraction of pattern prefixes.
Previously, like_fixed_prefix() used char-at-a-time logic, which
forced it to be too conservative for case-insensitive matching.
Now, use pg_wchar-at-a-time loop for text types, along with proper
detection of cased characters; and preserve the char-at-a-time logic
for bytea.
Removes the pg_locale_t char_is_cased() single-byte method and
replaces it with a proper multibyte pg_iswcased() method.
Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d.camel@j-davis.com
---
src/backend/utils/adt/like_support.c | 111 +++++++++++++---------
src/backend/utils/adt/pg_locale.c | 26 +++--
src/backend/utils/adt/pg_locale_builtin.c | 7 +-
src/backend/utils/adt/pg_locale_icu.c | 15 ++-
src/backend/utils/adt/pg_locale_libc.c | 23 +++--
src/include/utils/pg_locale.h | 5 +-
6 files changed, 103 insertions(+), 84 deletions(-)
diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c
index 0debccfa67b..e7255fa652a 100644
--- a/src/backend/utils/adt/like_support.c
+++ b/src/backend/utils/adt/like_support.c
@@ -987,12 +987,11 @@ static Pattern_Prefix_Status
like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
Const **prefix_const, Selectivity *rest_selec)
{
- char *match;
char *patt;
int pattlen;
Oid typeid = patt_const->consttype;
- int pos,
- match_pos;
+ int pos;
+ int match_pos = 0;
pg_locale_t locale = 0;
/* the right-hand const is type text or bytea */
@@ -1020,67 +1019,107 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
locale = pg_newlocale_from_collation(collation);
}
+ /* for text types, use pg_wchar; for BYTEA, use char */
if (typeid != BYTEAOID)
{
- patt = TextDatumGetCString(patt_const->constvalue);
- pattlen = strlen(patt);
+ text *val = DatumGetTextPP(patt_const->constvalue);
+ pg_wchar *wpatt;
+ pg_wchar *wmatch;
+ char *match;
+ int wpattlen;
+ int wpos;
+
+ patt = VARDATA_ANY(val);
+ pattlen = VARSIZE_ANY_EXHDR(val);
+ wpatt = palloc((pattlen + 1) * sizeof(pg_wchar));
+ wmatch = palloc((pattlen + 1) * sizeof(pg_wchar));
+
+ /*
+ * The number of characters can be smaller than the number of bytes,
+ * so iterate over the converted length rather than pattlen.
+ */
+ wpattlen = pg_mb2wchar_with_len(patt, wpatt, pattlen);
+
+ match = palloc(pattlen + 1);
+ for (wpos = 0; wpos < wpattlen; wpos++)
+ {
+ /* % and _ are wildcard characters in LIKE */
+ if (wpatt[wpos] == '%' ||
+ wpatt[wpos] == '_')
+ break;
+
+ /* Backslash escapes the next character */
+ if (wpatt[wpos] == '\\')
+ {
+ wpos++;
+ if (wpos >= wpattlen)
+ break;
+ }
+
+ /*
+ * For ILIKE, stop if it's a case-varying character (it's sort of
+ * a wildcard).
+ */
+ if (case_insensitive && pg_iswcased(wpatt[wpos], locale))
+ break;
+
+ wmatch[match_pos++] = wpatt[wpos];
+ }
+
+ wmatch[match_pos] = '\0';
+
+ /* The prefix is never longer, in bytes, than the pattern itself. */
+ pg_wchar2mb_with_len(wmatch, match, match_pos);
+
+ /*
+ * Code after this block indexes patt by byte, so translate the
+ * number of characters consumed into a byte offset.
+ */
+ for (pos = 0; wpos > 0; wpos--)
+ pos += pg_mblen(&patt[pos]);
+
+ pfree(wpatt);
+ pfree(wmatch);
+
+ *prefix_const = string_to_const(match, typeid);
+ pfree(match);
}
else
{
bytea *bstr = DatumGetByteaPP(patt_const->constvalue);
+ char *match;
+ patt = VARDATA_ANY(bstr);
pattlen = VARSIZE_ANY_EXHDR(bstr);
- patt = (char *) palloc(pattlen);
- memcpy(patt, VARDATA_ANY(bstr), pattlen);
- Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
- }
- match = palloc(pattlen + 1);
- match_pos = 0;
- for (pos = 0; pos < pattlen; pos++)
- {
- /* % and _ are wildcard characters in LIKE */
- if (patt[pos] == '%' ||
- patt[pos] == '_')
- break;
-
- /* Backslash escapes the next character */
- if (patt[pos] == '\\')
+ match = palloc(pattlen + 1);
+ for (pos = 0; pos < pattlen; pos++)
{
- pos++;
- if (pos >= pattlen)
+ /* % and _ are wildcard characters in LIKE */
+ if (patt[pos] == '%' ||
+ patt[pos] == '_')
break;
- }
- /*
- * Stop if case-varying character (it's sort of a wildcard).
- *
- * In multibyte character sets or with non-libc providers, we can't
- * use isalpha, and it does not seem worth trying to convert to
- * wchar_t or char32_t. Instead, just pass the single byte to the
- * provider, which will assume any non-ASCII char is potentially
- * case-varying.
- */
- if (case_insensitive && char_is_cased(patt[pos], locale))
- break;
-
- match[match_pos++] = patt[pos];
- }
+ /* Backslash escapes the next character */
+ if (patt[pos] == '\\')
+ {
+ pos++;
+ if (pos >= pattlen)
+ break;
+ }
- match[match_pos] = '\0';
+ match[match_pos++] = patt[pos];
+ }
- if (typeid != BYTEAOID)
- *prefix_const = string_to_const(match, typeid);
- else
*prefix_const = string_to_bytea_const(match, match_pos);
+ pfree(match);
+ }
+
if (rest_selec != NULL)
*rest_selec = like_selectivity(&patt[pos], pattlen - pos,
case_insensitive);
- pfree(patt);
- pfree(match);
-
/* in LIKE, an empty pattern is an exact match! */
if (pos == pattlen)
return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 5aba277ba99..c4e89502f85 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1588,6 +1588,17 @@ pg_iswxdigit(pg_wchar wc, pg_locale_t locale)
return locale->ctype->wc_isxdigit(wc, locale);
}
+bool
+pg_iswcased(pg_wchar wc, pg_locale_t locale)
+{
+ /* for the C locale, Cased and Alpha are equivalent */
+ if (locale->ctype == NULL)
+ return (wc <= (pg_wchar) 127 &&
+ (pg_char_properties[wc] & PG_ISALPHA));
+ else
+ return locale->ctype->wc_iscased(wc, locale);
+}
+
pg_wchar
pg_towupper(pg_wchar wc, pg_locale_t locale)
{
@@ -1614,21 +1625,6 @@ pg_towlower(pg_wchar wc, pg_locale_t locale)
return locale->ctype->wc_tolower(wc, locale);
}
-/*
- * char_is_cased()
- *
- * Fuzzy test of whether the given char is case-varying or not. The argument
- * is a single byte, so in a multibyte encoding, just assume any non-ASCII
- * char is case-varying.
- */
-bool
-char_is_cased(char ch, pg_locale_t locale)
-{
- if (locale->ctype == NULL)
- return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
- return locale->ctype->char_is_cased(ch, locale);
-}
-
/*
* Return required encoding ID for the given locale, or -1 if any encoding is
* valid for the locale.
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 1021e0d129b..0c2920112bb 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -186,10 +186,9 @@ wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
}
static bool
-char_is_cased_builtin(char ch, pg_locale_t locale)
+wc_iscased_builtin(pg_wchar wc, pg_locale_t locale)
{
- return IS_HIGHBIT_SET(ch) ||
- (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
+ return pg_u_prop_cased(to_char32(wc));
}
static pg_wchar
@@ -219,7 +218,7 @@ static const struct ctype_methods ctype_methods_builtin = {
.wc_ispunct = wc_ispunct_builtin,
.wc_isspace = wc_isspace_builtin,
.wc_isxdigit = wc_isxdigit_builtin,
- .char_is_cased = char_is_cased_builtin,
+ .wc_iscased = wc_iscased_builtin,
.wc_tolower = wc_tolower_builtin,
.wc_toupper = wc_toupper_builtin,
};
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index f5a0cc8fe41..18d026deda8 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -121,13 +121,6 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
const char *locale,
UErrorCode *pErrorCode);
-static bool
-char_is_cased_icu(char ch, pg_locale_t locale)
-{
- return IS_HIGHBIT_SET(ch) ||
- (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
-}
-
/*
* XXX: many of the functions below rely on casts directly from pg_wchar to
* UChar32, which is correct for the UTF-8 encoding, but not in general.
@@ -223,6 +216,12 @@ wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale)
return u_isxdigit(wc);
}
+static bool
+wc_iscased_icu(pg_wchar wc, pg_locale_t locale)
+{
+ return u_hasBinaryProperty(wc, UCHAR_CASED);
+}
+
static const struct ctype_methods ctype_methods_icu = {
.strlower = strlower_icu,
.strtitle = strtitle_icu,
@@ -238,7 +237,7 @@ static const struct ctype_methods ctype_methods_icu = {
.wc_ispunct = wc_ispunct_icu,
.wc_isspace = wc_isspace_icu,
.wc_isxdigit = wc_isxdigit_icu,
- .char_is_cased = char_is_cased_icu,
+ .wc_iscased = wc_iscased_icu,
.wc_toupper = toupper_icu,
.wc_tolower = tolower_icu,
};
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 545ee9a3099..fa419863fa7 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -184,6 +184,13 @@ wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
#endif
}
+static bool
+wc_iscased_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+ return isupper_l((unsigned char) wc, locale->lt) ||
+ islower_l((unsigned char) wc, locale->lt);
+}
+
static bool
wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
{
@@ -249,14 +256,10 @@ wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
}
static bool
-char_is_cased_libc(char ch, pg_locale_t locale)
+wc_iscased_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- bool is_multibyte = pg_database_encoding_max_length() > 1;
-
- if (is_multibyte && IS_HIGHBIT_SET(ch))
- return true;
- else
- return isalpha_l((unsigned char) ch, locale->lt);
+ return iswupper_l((wint_t) wc, locale->lt) ||
+ iswlower_l((wint_t) wc, locale->lt);
}
static pg_wchar
@@ -331,7 +334,7 @@ static const struct ctype_methods ctype_methods_libc_sb = {
.wc_ispunct = wc_ispunct_libc_sb,
.wc_isspace = wc_isspace_libc_sb,
.wc_isxdigit = wc_isxdigit_libc_sb,
- .char_is_cased = char_is_cased_libc,
+ .wc_iscased = wc_iscased_libc_sb,
.wc_toupper = toupper_libc_sb,
.wc_tolower = tolower_libc_sb,
};
@@ -356,7 +359,7 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
.wc_ispunct = wc_ispunct_libc_sb,
.wc_isspace = wc_isspace_libc_sb,
.wc_isxdigit = wc_isxdigit_libc_sb,
- .char_is_cased = char_is_cased_libc,
+ .wc_iscased = wc_iscased_libc_sb,
.wc_toupper = toupper_libc_sb,
.wc_tolower = tolower_libc_sb,
};
@@ -377,7 +380,7 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
.wc_ispunct = wc_ispunct_libc_mb,
.wc_isspace = wc_isspace_libc_mb,
.wc_isxdigit = wc_isxdigit_libc_mb,
- .char_is_cased = char_is_cased_libc,
+ .wc_iscased = wc_iscased_libc_mb,
.wc_toupper = toupper_libc_mb,
.wc_tolower = tolower_libc_mb,
};
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 50520e50127..01f891def7a 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -122,11 +122,9 @@ struct ctype_methods
bool (*wc_ispunct) (pg_wchar wc, pg_locale_t locale);
bool (*wc_isspace) (pg_wchar wc, pg_locale_t locale);
bool (*wc_isxdigit) (pg_wchar wc, pg_locale_t locale);
+ bool (*wc_iscased) (pg_wchar wc, pg_locale_t locale);
pg_wchar (*wc_toupper) (pg_wchar wc, pg_locale_t locale);
pg_wchar (*wc_tolower) (pg_wchar wc, pg_locale_t locale);
-
- /* required */
- bool (*char_is_cased) (char ch, pg_locale_t locale);
};
/*
@@ -214,6 +212,7 @@ extern bool pg_iswprint(pg_wchar wc, pg_locale_t locale);
extern bool pg_iswpunct(pg_wchar wc, pg_locale_t locale);
extern bool pg_iswspace(pg_wchar wc, pg_locale_t locale);
extern bool pg_iswxdigit(pg_wchar wc, pg_locale_t locale);
+extern bool pg_iswcased(pg_wchar wc, pg_locale_t locale);
extern pg_wchar pg_towupper(pg_wchar wc, pg_locale_t locale);
extern pg_wchar pg_towlower(pg_wchar wc, pg_locale_t locale);
--
2.43.0