v5-0001-Fix-greedy-substring-search-for-non-deterministic.patch
text/x-patch
Filename: v5-0001-Fix-greedy-substring-search-for-non-deterministic.patch
Type: text/x-patch
Part: 0
From e265fc5790d4c8224bb758518470b2424418d913 Mon Sep 17 00:00:00 2001
From: Laurenz Albe <laurenz.albe@cybertec.at>
Date: Thu, 4 Dec 2025 18:07:06 +0100
Subject: [PATCH v5] Fix greedy substring search for non-deterministic
collations
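Due to an off-by-one error, the loop over candidate substrings never
tested the one extending to the end of the haystack, so the code failed
to find matches at the end of the haystack. Fix by rewriting the loop
so that the end of the candidate is advanced before each comparison.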
While at it, fix a comment that claimed that the function could find
a zero-length match. Such a match could send a caller into an
endless loop. However, zero-length matches can only occur with an
empty search string, and that case is explicitly excluded by all
callers. To make sure it stays that way, add an Assert that the
search string is not empty.
Author: Laurenz Albe <laurenz.albe@cybertec.at>
Reported-By: Adam Warland <adam.warland@infor.com>
Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-By: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://postgr.es/m/19341-1d9a22915edfec58%40postgresql.org
---
src/backend/utils/adt/varlena.c | 24 ++++++++++++-------
.../regress/expected/collate.icu.utf8.out | 7 ++++++
src/test/regress/sql/collate.icu.utf8.sql | 3 +++
3 files changed, 25 insertions(+), 9 deletions(-)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 3894457ab40..26fbb9e969b 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -1110,10 +1110,10 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
const char *haystack_end = &haystack[haystack_len];
const char *hptr;
- Assert(start_ptr >= haystack && start_ptr <= haystack_end);
-
state->last_match_len_tmp = needle_len;
+ Assert(start_ptr >= haystack && start_ptr <= haystack_end && needle_len > 0);
+
if (!state->locale->deterministic)
{
/*
@@ -1123,9 +1123,9 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
* needle under the given collation.
*
* Note, the found substring could have a different length than the
- * needle, including being empty. Callers that want to skip over the
- * found string need to read the length of the found substring from
- * last_match_len rather than just using the length of their needle.
+ * needle. Callers that want to skip over the found string need to
+ * read the length of the found substring from last_match_len rather
+ * than just using the length of their needle.
*
* Most callers will require "greedy" semantics, meaning that we need
* to find the longest such substring, not the shortest. For callers
@@ -1136,6 +1136,8 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
hptr = start_ptr;
while (hptr < haystack_end)
{
+ const char *test_end;
+
/*
* First check the common case that there is a match in the
* haystack of exactly the length of the needle.
@@ -1146,11 +1148,14 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
return (char *) hptr;
/*
- * Else check if any of the possible substrings starting at hptr
- * are equal to the needle.
+ * Else check if any of the possible non-empty substrings starting
+ * at hptr are equal to the needle.
*/
- for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end))
+ test_end = hptr;
+ do
{
+ test_end += pg_mblen(test_end);
+
if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
{
state->last_match_len_tmp = (test_end - hptr);
@@ -1158,7 +1163,8 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
if (!state->greedy)
break;
}
- }
+ } while (test_end < haystack_end);
+
if (result_hptr)
break;
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index b8579a1efc6..0a14c1d93ff 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1484,6 +1484,13 @@ SELECT array_sort('{a,B}'::text[] COLLATE "C");
{B,a}
(1 row)
+-- test replace() at the end of the string (bug #19341)
+SELECT replace('testX' COLLATE case_insensitive, 'x' COLLATE case_insensitive, 'es');
+ replace
+---------
+ testes
+(1 row)
+
-- test language tags
CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 6f5abac0dc0..5e3fef9b381 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -568,6 +568,9 @@ SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_inse
SELECT array_sort('{a,B}'::text[] COLLATE case_insensitive);
SELECT array_sort('{a,B}'::text[] COLLATE "C");
+-- test replace() at the end of the string (bug #19341)
+SELECT replace('testX' COLLATE case_insensitive, 'x' COLLATE case_insensitive, 'es');
+
-- test language tags
CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
--
2.52.0