v1-0001-Add-xicorr-X-Y.patch
application/octet-stream
Filename: v1-0001-Add-xicorr-X-Y.patch
Type: application/octet-stream
Part: 0
From 11a33fba2d02f3f76f1202735f87f4e27416253b Mon Sep 17 00:00:00 2001
From: Florents Tselai <florents.tselai@gmail.com>
Date: Sun, 31 Aug 2025 15:23:18 +0300
Subject: [PATCH v1] Add xicorr(X, Y)
---
doc/src/sgml/func/func-aggregate.sgml | 28 ++++
src/backend/utils/adt/float.c | 156 +++++++++++++++++++++++
src/include/catalog/pg_aggregate.dat | 2 +
src/include/catalog/pg_proc.dat | 9 ++
src/test/regress/expected/aggregates.out | 10 ++
src/test/regress/sql/aggregates.sql | 5 +
6 files changed, 210 insertions(+)
diff --git a/doc/src/sgml/func/func-aggregate.sgml b/doc/src/sgml/func/func-aggregate.sgml
index f50b692516b..4de10a5cb30 100644
--- a/doc/src/sgml/func/func-aggregate.sgml
+++ b/doc/src/sgml/func/func-aggregate.sgml
@@ -1091,6 +1091,34 @@ SELECT count(*) FROM sometable;
</para></entry>
<entry>Yes</entry>
</row>
+
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>xi correlation</primary>
+ </indexterm>
+ <indexterm>
+ <primary>xicorr</primary>
+ </indexterm>
+ <function>xicorr</function> ( <parameter>X</parameter> <type>double precision</type>, <parameter>Y</parameter> <type>double precision</type> )
+ <returnvalue>double precision</returnvalue>
+ </para>
+ <para>
+ Computes Chatterjee's xi (ξ) correlation coefficient.
+ The xi correlation coefficient is a measure of association between two variables;
+ the value tends to be close to zero when the variables
+ are independent and close to 1 when there is a strong association.
+ Unlike other correlation coefficients,
+ the xi correlation is effective even when the association is not monotonic.
+ Note that this statistic is non-symmetric by design.
+ Tied values of <replaceable>x</replaceable> are ordered
+ deterministically by their <replaceable>y</replaceable>
+ values. This differs from some software which breaks ties arbitrarily.
+ With small samples, the sample coefficient can be slightly negative.
+ This implementation clamps such values to 0.
+ </para></entry>
+ <entry>No</entry>
+ </row>
</tbody>
</tgroup>
</table>
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index 7b97d2be6ca..990430c4c53 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -3754,6 +3754,162 @@ float8_corr(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(Sxy / sqrt(Sxx * Syy));
}
+/* xicorr(X, Y) infrastructure */
+
+/*
+ * Transition state for the xicorr aggregate.
+ *
+ * The aggregate buffers every non-NULL input pair; xs[i] and ys[i] together
+ * hold the i-th accepted pair.  Both arrays always have the same allocated
+ * length (cap) and are doubled together when they fill up.
+ */
+typedef struct XiState
+{
+ int n; /* number of (x, y) pairs stored so far */
+ int cap; /* allocated length of xs and ys, in elements */
+ double *xs; /* first aggregate argument of each stored pair */
+ double *ys; /* second aggregate argument of each stored pair */
+} XiState;
+
+/*
+ * xicorr_transfn
+ *      Transition function for the xicorr aggregate.
+ *
+ * Arguments:
+ *  0: the XiState built so far, or NULL on the first call for a group
+ *  1: float8 stored as the "x" member of the pair
+ *  2: float8 stored as the "y" member of the pair
+ *
+ * The function is not strict, so it sees NULL inputs itself: the state is
+ * created on the first call even if that row is skipped, and any row in
+ * which either value is NULL is ignored.
+ *
+ * Returns the (possibly newly created) state pointer.  The state and its
+ * arrays are allocated in the aggregate memory context reported by
+ * AggCheckCallContext().
+ */
+Datum
+xicorr_transfn(PG_FUNCTION_ARGS)
+{
+    MemoryContext aggctx;
+    XiState    *st;
+
+    if (!AggCheckCallContext(fcinfo, &aggctx))
+        elog(ERROR, "must be called as aggregate");
+
+    if (PG_ARGISNULL(0))
+    {
+        /* first call for this group: build an empty state */
+        st = (XiState *) MemoryContextAllocZero(aggctx, sizeof(XiState));
+        st->n = 0;
+        st->cap = 64;
+        st->xs = (double *) MemoryContextAlloc(aggctx,
+                                               sizeof(double) * st->cap);
+        st->ys = (double *) MemoryContextAlloc(aggctx,
+                                               sizeof(double) * st->cap);
+    }
+    else
+        st = (XiState *) PG_GETARG_POINTER(0);
+
+    /* a pair with a NULL member contributes nothing */
+    if (PG_ARGISNULL(1) || PG_ARGISNULL(2))
+        PG_RETURN_POINTER(st);
+
+    /* double both arrays together once they are full */
+    if (st->n >= st->cap)
+    {
+        int         newcap = st->cap * 2;
+
+        st->xs = (double *) repalloc(st->xs, sizeof(double) * newcap);
+        st->ys = (double *) repalloc(st->ys, sizeof(double) * newcap);
+        st->cap = newcap;
+    }
+
+    /*
+     * Fetch the values only now that both are known to be non-NULL, so a
+     * NULL argument's datum is never interpreted as a float8.
+     */
+    st->xs[st->n] = PG_GETARG_FLOAT8(1);
+    st->ys[st->n] = PG_GETARG_FLOAT8(2);
+    st->n++;
+
+    PG_RETURN_POINTER(st);
+}
+
+/*
+ * qsort_arg comparator: order row indices by their y value.
+ *
+ * a and b point to int indices into the XiState passed as arg.  NaN is
+ * ordered after every other value and two NaNs compare equal; plain "<" and
+ * ">" tests would report NaN as equal to everything, which is not a
+ * consistent ordering for a sort.
+ */
+static int
+cmp_y(const void *a, const void *b, void *arg)
+{
+    const XiState *st = (const XiState *) arg;
+    double      ya = st->ys[*(const int *) a];
+    double      yb = st->ys[*(const int *) b];
+
+    if (isnan(ya))
+        return isnan(yb) ? 0 : 1;
+    if (isnan(yb))
+        return -1;
+
+    if (ya < yb)
+        return -1;
+    if (ya > yb)
+        return 1;
+    return 0;
+}
+
+/*
+ * qsort_arg comparator: order row indices by x, breaking ties in x by y.
+ *
+ * a and b point to int indices into the XiState passed as arg.  For both
+ * keys NaN is ordered after every other value and two NaNs compare equal;
+ * plain "<" and ">" tests would report NaN as equal to everything, which is
+ * not a consistent ordering for a sort.
+ */
+static int
+cmp_x_then_y(const void *a, const void *b, void *arg)
+{
+    const XiState *st = (const XiState *) arg;
+    int         ia = *(const int *) a;
+    int         ib = *(const int *) b;
+    double      xa = st->xs[ia];
+    double      xb = st->xs[ib];
+    double      ya = st->ys[ia];
+    double      yb = st->ys[ib];
+
+    /* primary key: x */
+    if (isnan(xa))
+    {
+        if (!isnan(xb))
+            return 1;
+    }
+    else if (isnan(xb))
+        return -1;
+    else if (xa < xb)
+        return -1;
+    else if (xa > xb)
+        return 1;
+
+    /* x values are equal (or both NaN): fall back to y */
+    if (isnan(ya))
+        return isnan(yb) ? 0 : 1;
+    if (isnan(yb))
+        return -1;
+
+    if (ya < yb)
+        return -1;
+    if (ya > yb)
+        return 1;
+    return 0;
+}
+
+/*
+ * float8_xicorr
+ *      Final function for the xicorr aggregate.
+ *
+ * Computes Chatterjee's xi coefficient from the pairs buffered in the
+ * XiState passed as argument 0:
+ *
+ *      xi = 1 - n * sum_i |r[i+1] - r[i]| / (2 * sum_i l[i] * (n - l[i]))
+ *
+ * where the rows are taken in order of x (ties in x broken by y), r[i] is
+ * the number of rows whose y is <= the y of row i, and l[i] is the number of
+ * rows whose y is >= the y of row i.  When y has no ties the denominator is
+ * n * (n^2 - 1) / 3, so this reduces to 1 - 3 * sum / (n^2 - 1).
+ *
+ * Returns NULL if the state is NULL, if fewer than two pairs were collected,
+ * or if every y value is the same (the denominator is then zero and the
+ * coefficient is undefined).  The result is clamped to [0, 1].
+ *
+ * The transition state is only read, never modified.
+ */
+Datum
+float8_xicorr(PG_FUNCTION_ARGS)
+{
+    XiState    *st;
+    int         n;
+    int        *order;
+    double     *rank_y;
+    double      tie_term = 0.0;
+    double      sum_diff = 0.0;
+    double      xi;
+
+    if (PG_ARGISNULL(0))
+        PG_RETURN_NULL();
+
+    st = (XiState *) PG_GETARG_POINTER(0);
+    n = st->n;
+
+    if (n < 2)
+        PG_RETURN_NULL();
+
+    order = (int *) palloc(sizeof(int) * n);
+    rank_y = (double *) palloc(sizeof(double) * n);
+
+    for (int i = 0; i < n; i++)
+        order[i] = i;
+
+    /* --- 1. rank y, one group of equal values at a time --- */
+    qsort_arg(order, n, sizeof(int), cmp_y, st);
+
+    for (int start = 0; start < n;)
+    {
+        double      y0 = st->ys[order[start]];
+        int         end = start + 1;
+
+        /* extend the group over equal values; NaNs count as equal */
+        while (end < n)
+        {
+            double      y1 = st->ys[order[end]];
+
+            if (!(y1 == y0 || (isnan(y1) && isnan(y0))))
+                break;
+            end++;
+        }
+
+        /* every member of the group has r = end rows at or below it */
+        for (int k = start; k < end; k++)
+            rank_y[order[k]] = (double) end;
+
+        /*
+         * ... and l = n - start rows at or above it, hence n - l = start.
+         * Add l * (n - l) once per member of the group.
+         */
+        tie_term += (double) (end - start) *
+            (double) (n - start) * (double) start;
+
+        start = end;
+    }
+
+    /* all y values identical: the coefficient is undefined */
+    if (tie_term <= 0.0)
+    {
+        pfree(rank_y);
+        pfree(order);
+        PG_RETURN_NULL();
+    }
+
+    /*
+     * --- 2. order the rows by x (ties broken by y) ---
+     *
+     * The index array can be reused: it is still a permutation of 0..n-1.
+     */
+    qsort_arg(order, n, sizeof(int), cmp_x_then_y, st);
+
+    /* --- 3. sum the rank differences of successive rows --- */
+    for (int t = 0; t + 1 < n; t++)
+        sum_diff += fabs(rank_y[order[t + 1]] - rank_y[order[t]]);
+
+    pfree(rank_y);
+    pfree(order);
+
+    /* --- 4. coefficient --- */
+    xi = 1.0 - ((double) n * sum_diff) / (2.0 * tie_term);
+
+    /* the sample coefficient can come out slightly negative; clamp it */
+    if (xi < 0.0)
+        xi = 0.0;
+    else if (xi > 1.0)
+        xi = 1.0;
+
+    PG_RETURN_FLOAT8(xi);
+}
+
+
Datum
float8_regr_r2(PG_FUNCTION_ARGS)
{
diff --git a/src/include/catalog/pg_aggregate.dat b/src/include/catalog/pg_aggregate.dat
index d6aa1f6ec47..4a69ec2486a 100644
--- a/src/include/catalog/pg_aggregate.dat
+++ b/src/include/catalog/pg_aggregate.dat
@@ -506,6 +506,8 @@
{ aggfnoid => 'corr', aggtransfn => 'float8_regr_accum',
aggfinalfn => 'float8_corr', aggcombinefn => 'float8_regr_combine',
aggtranstype => '_float8', agginitval => '{0,0,0,0,0,0}' },
+{ aggfnoid => 'xicorr', aggtransfn => 'xicorr_transfn',
+ aggfinalfn => 'float8_xicorr', aggtranstype => 'internal'},
# boolean-and and boolean-or
{ aggfnoid => 'bool_and', aggtransfn => 'booland_statefunc',
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 118d6da1ace..715f3ae04cb 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5165,6 +5165,12 @@
{ oid => '2817', descr => 'aggregate final function',
proname => 'float8_corr', prorettype => 'float8', proargtypes => '_float8',
prosrc => 'float8_corr' },
+{ oid => '2775', descr => 'xicorr transition function', prokind => 'f',
+ proname => 'xicorr_transfn', prorettype => 'internal', proargtypes => 'internal float8 float8',
+ prosrc => 'xicorr_transfn', proisstrict => 'f', provolatile => 'i', proparallel => 's' },
+{ oid => '3814', descr => 'aggregate final function', prokind => 'f',
+ proname => 'float8_xicorr', prorettype => 'float8', proargtypes => 'internal',
+ prosrc => 'float8_xicorr', proisstrict => 't', provolatile => 'i', proparallel => 's' },
{ oid => '3535', descr => 'aggregate transition function',
proname => 'string_agg_transfn', proisstrict => 'f', prorettype => 'internal',
@@ -7311,6 +7317,9 @@
{ oid => '2829', descr => 'correlation coefficient',
proname => 'corr', prokind => 'a', proisstrict => 'f', prorettype => 'float8',
proargtypes => 'float8 float8', prosrc => 'aggregate_dummy' },
+{ oid => '3063', descr => 'xi correlation coefficient',
+ proname => 'xicorr', prokind => 'a', proisstrict => 'f', prorettype => 'float8',
+ proargtypes => 'float8 float8', prosrc => 'aggregate_dummy' },
{ oid => '2160',
proname => 'text_pattern_lt', proleakproof => 't', prorettype => 'bool',
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index c35288eecde..69969859ea9 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -496,6 +496,16 @@ SELECT corr(b, a) FROM aggtest;
0.139634516517873
(1 row)
+CREATE TEMPORARY TABLE bigger_aggtest AS
+SELECT g AS a,
+ g*g + 2 AS b
+FROM generate_series(1,100) g;
+SELECT xicorr(b, a) FROM bigger_aggtest;
+ xicorr
+------------------
+ 0.97029702970297
+(1 row)
+
-- check single-tuple behavior
SELECT covar_pop(1::float8,2::float8), covar_samp(3::float8,4::float8);
covar_pop | covar_samp
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 62540b1ffa4..7714fdff281 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -134,6 +134,11 @@ SELECT regr_r2(b, a) FROM aggtest;
SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest;
SELECT corr(b, a) FROM aggtest;
+CREATE TEMPORARY TABLE bigger_aggtest AS
+SELECT g AS a,
+ g*g + 2 AS b
+FROM generate_series(1,100) g;
+SELECT xicorr(b, a) FROM bigger_aggtest;
-- check single-tuple behavior
SELECT covar_pop(1::float8,2::float8), covar_samp(3::float8,4::float8);
--
2.39.5 (Apple Git-154)