v13-0001-fuzzystrmatch-use-pg_ascii_toupper.patch
text/x-patch
Filename: v13-0001-fuzzystrmatch-use-pg_ascii_toupper.patch
Type: text/x-patch
Part: 0
From 8161ca49ae2044e004d3f36c04f60b03e97f4071 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 19 Nov 2025 13:24:38 -0800
Subject: [PATCH v13 1/2] fuzzystrmatch: use pg_ascii_toupper().
fuzzystrmatch is designed for ASCII, so there is no need to rely on the
global LC_CTYPE setting. Replace toupper() with pg_ascii_toupper() and
isalpha() with an ASCII-only check.
TODO: what about the \xc7 (C WITH CEDILLA) case in dmetaphone? Also, what
should the behavior be for soundex()?
Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d.camel@j-davis.com
---
contrib/fuzzystrmatch/dmetaphone.c | 45 +++++++++++++++++++++++++--
contrib/fuzzystrmatch/fuzzystrmatch.c | 43 ++++++++++++++-----------
2 files changed, 67 insertions(+), 21 deletions(-)
diff --git a/contrib/fuzzystrmatch/dmetaphone.c b/contrib/fuzzystrmatch/dmetaphone.c
index 227d8b11ddc..9a4e5ae7e0e 100644
--- a/contrib/fuzzystrmatch/dmetaphone.c
+++ b/contrib/fuzzystrmatch/dmetaphone.c
@@ -98,6 +98,7 @@ The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and
#include "postgres.h"
+#include "mb/pg_wchar.h"
#include "utils/builtins.h"
/* turn off assertions for embedded function */
@@ -116,6 +117,9 @@ The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and
#include <assert.h>
#include <ctype.h>
+#define SMALL_LETTER_C_WITH_CEDILLA '\xe7'
+#define CAPITAL_LETTER_C_WITH_CEDILLA '\xc7'
+
/* prototype for the main function we got from the perl module */
static void DoubleMetaphone(char *str, char **codes);
@@ -282,9 +286,46 @@ static void
MakeUpper(metastring *s)
{
char *i;
+ bool c_with_cedilla;
+
+ /*
+ * C WITH CEDILLA should be uppercased, as well.
+ *
+ * XXX: Only works in single-byte encodings that encode lowercase C WITH
+ * CEDILLA as \xe7. Should have proper multibyte support.
+ *
+ * NB: WIN1256 encodes only the lowercase C WITH CEDILLA, but for the
+ * purposes of metaphone, we can still "uppercase" it to \xc7 here so that
+ * it's recognized later.
+ */
+ switch (GetDatabaseEncoding())
+ {
+ case PG_LATIN1:
+ case PG_LATIN2:
+ case PG_LATIN3:
+ case PG_LATIN5:
+ case PG_LATIN8:
+ case PG_LATIN9:
+ case PG_LATIN10:
+ case PG_WIN1250:
+ case PG_WIN1252:
+ case PG_WIN1254:
+ case PG_WIN1256:
+ case PG_WIN1258:
+ c_with_cedilla = true;
+ break;
+ default:
+ c_with_cedilla = false;
+ break;
+ }
for (i = s->str; *i; i++)
- *i = toupper((unsigned char) *i);
+ {
+ if (c_with_cedilla && *i == SMALL_LETTER_C_WITH_CEDILLA)
+ *i = CAPITAL_LETTER_C_WITH_CEDILLA;
+ else
+ *i = pg_ascii_toupper((unsigned char) *i);
+ }
}
@@ -463,7 +504,7 @@ DoubleMetaphone(char *str, char **codes)
current += 1;
break;
- case '\xc7': /* C with cedilla */
+ case CAPITAL_LETTER_C_WITH_CEDILLA:
MetaphAdd(primary, "S");
MetaphAdd(secondary, "S");
current += 1;
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c
index e7cc314b763..319302af0e4 100644
--- a/contrib/fuzzystrmatch/fuzzystrmatch.c
+++ b/contrib/fuzzystrmatch/fuzzystrmatch.c
@@ -62,7 +62,7 @@ static const char *const soundex_table = "01230120022455012623010202";
static char
soundex_code(char letter)
{
- letter = toupper((unsigned char) letter);
+ letter = pg_ascii_toupper((unsigned char) letter);
/* Defend against non-ASCII letters */
if (letter >= 'A' && letter <= 'Z')
return soundex_table[letter - 'A'];
@@ -122,16 +122,21 @@ static const char _codes[26] = {
static int
getcode(char c)
{
- if (isalpha((unsigned char) c))
- {
- c = toupper((unsigned char) c);
- /* Defend against non-ASCII letters */
- if (c >= 'A' && c <= 'Z')
- return _codes[c - 'A'];
- }
+ c = pg_ascii_toupper((unsigned char) c);
+ /* Defend against non-ASCII letters */
+ if (c >= 'A' && c <= 'Z')
+ return _codes[c - 'A'];
+
return 0;
}
+static bool
+ascii_isalpha(char c)
+{
+ return (c >= 'A' && c <= 'Z') ||
+ (c >= 'a' && c <= 'z');
+}
+
#define isvowel(c) (getcode(c) & 1) /* AEIOU */
/* These letters are passed through unchanged */
@@ -301,18 +306,18 @@ metaphone(PG_FUNCTION_ARGS)
* accessing the array directly... */
/* Look at the next letter in the word */
-#define Next_Letter (toupper((unsigned char) word[w_idx+1]))
+#define Next_Letter (pg_ascii_toupper((unsigned char) word[w_idx+1]))
/* Look at the current letter in the word */
-#define Curr_Letter (toupper((unsigned char) word[w_idx]))
+#define Curr_Letter (pg_ascii_toupper((unsigned char) word[w_idx]))
/* Go N letters back. */
#define Look_Back_Letter(n) \
- (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
+ (w_idx >= (n) ? pg_ascii_toupper((unsigned char) word[w_idx-(n)]) : '\0')
/* Previous letter. I dunno, should this return null on failure? */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down. It makes sure you don't walk off the string. */
#define After_Next_Letter \
- (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
-#define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
+ (Next_Letter != '\0' ? pg_ascii_toupper((unsigned char) word[w_idx+2]) : '\0')
+#define Look_Ahead_Letter(n) pg_ascii_toupper((unsigned char) Lookahead(word+w_idx, n))
/* Allows us to safely look ahead an arbitrary # of letters */
@@ -340,7 +345,7 @@ Lookahead(char *word, int how_far)
#define Phone_Len (p_idx)
/* Note is a letter is a 'break' in the word */
-#define Isbreak(c) (!isalpha((unsigned char) (c)))
+#define Isbreak(c) (!ascii_isalpha((unsigned char) (c)))
static void
@@ -379,7 +384,7 @@ _metaphone(char *word, /* IN */
/*-- The first phoneme has to be processed specially. --*/
/* Find our first letter */
- for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
+ for (; !ascii_isalpha((unsigned char) (Curr_Letter)); w_idx++)
{
/* On the off chance we were given nothing but crap... */
if (Curr_Letter == '\0')
@@ -478,7 +483,7 @@ _metaphone(char *word, /* IN */
*/
/* Ignore non-alphas */
- if (!isalpha((unsigned char) (Curr_Letter)))
+ if (!ascii_isalpha((unsigned char) (Curr_Letter)))
continue;
/* Drop duplicates, except CC */
@@ -731,7 +736,7 @@ _soundex(const char *instr, char *outstr)
Assert(outstr);
/* Skip leading non-alphabetic characters */
- while (*instr && !isalpha((unsigned char) *instr))
+ while (*instr && !ascii_isalpha((unsigned char) *instr))
++instr;
/* If no string left, return all-zeroes buffer */
@@ -742,12 +747,12 @@ _soundex(const char *instr, char *outstr)
}
/* Take the first letter as is */
- *outstr++ = (char) toupper((unsigned char) *instr++);
+ *outstr++ = (char) pg_ascii_toupper((unsigned char) *instr++);
count = 1;
while (*instr && count < SOUNDEX_LEN)
{
- if (isalpha((unsigned char) *instr) &&
+ if (ascii_isalpha((unsigned char) *instr) &&
soundex_code(*instr) != soundex_code(*(instr - 1)))
{
*outstr = soundex_code(*instr);
--
2.43.0