From 71f59bd83e559df6b36720a4592bf3fdd689504a Mon Sep 17 00:00:00 2001
From: Ilia Evdokimov <ilya.evdokimov@tantorlabs.ru>
Date: Mon, 10 Nov 2025 16:11:43 +0300
Subject: [PATCH v1] Avoid duplicate MCV matching in eqjoinsel_semi and
 eqjoinsel_inner.

Previously both eqjoinsel_inner() and eqjoinsel_semi() performed identical
O(N^2) loops over MCV lists, even though the semi join case always follows
the inner join case in eqjoinsel().  Now the MCV matching results from
eqjoinsel_inner() are reused in eqjoinsel_semi() when possible (i.e., when
the RHS MCV list is not clamped).  This saves redundant computation and
simplifies the code.

Author: Ilia Evdokimov <ilya.evdokimov@tantorlabs.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: David Geier <geidav.pg@gmail.com>
---
 src/backend/utils/adt/selfuncs.c | 36 ++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index cb23ad52782..55cd0486bf9 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -154,7 +154,9 @@ static double eqjoinsel_inner(Oid opfuncoid, Oid collation,
 							  bool isdefault1, bool isdefault2,
 							  AttStatsSlot *sslot1, AttStatsSlot *sslot2,
 							  Form_pg_statistic stats1, Form_pg_statistic stats2,
-							  bool have_mcvs1, bool have_mcvs2);
+							  bool have_mcvs1, bool have_mcvs2,
+							  double *matchfreq_mcvs1, double *matchfreq_mcvs2,
+							  int *nmatches_mcvs);
 static double eqjoinsel_semi(Oid opfuncoid, Oid collation,
 							 VariableStatData *vardata1, VariableStatData *vardata2,
 							 double nd1, double nd2,
@@ -162,6 +164,7 @@ static double eqjoinsel_semi(Oid opfuncoid, Oid collation,
 							 AttStatsSlot *sslot1, AttStatsSlot *sslot2,
 							 Form_pg_statistic stats1, Form_pg_statistic stats2,
 							 bool have_mcvs1, bool have_mcvs2,
+							 double matchfreq1, int nmatches,
 							 RelOptInfo *inner_rel);
 static bool estimate_multivariate_ndistinct(PlannerInfo *root,
 											RelOptInfo *rel, List **varinfos, double *ndistinct);
@@ -2313,6 +2316,9 @@ eqjoinsel(PG_FUNCTION_ARGS)
 	bool		get_mcv_stats;
 	bool		join_is_reversed;
 	RelOptInfo *inner_rel;
+	int		nmatches_mcvs = 0;
+	double		matchfreq_mcvs1 = 0.0;
+	double		matchfreq_mcvs2 = 0.0;
 
 	get_join_variables(root, args, sjinfo,
 					   &vardata1, &vardata2, &join_is_reversed);
@@ -2367,7 +2373,9 @@ eqjoinsel(PG_FUNCTION_ARGS)
 								  isdefault1, isdefault2,
 								  &sslot1, &sslot2,
 								  stats1, stats2,
-								  have_mcvs1, have_mcvs2);
+								  have_mcvs1, have_mcvs2,
+								  &matchfreq_mcvs1, &matchfreq_mcvs2,
+								  &nmatches_mcvs);
 
 	switch (sjinfo->jointype)
 	{
@@ -2395,6 +2403,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
 									   &sslot1, &sslot2,
 									   stats1, stats2,
 									   have_mcvs1, have_mcvs2,
+									   matchfreq_mcvs1, nmatches_mcvs,
 									   inner_rel);
 			else
 			{
@@ -2408,6 +2417,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
 									   &sslot2, &sslot1,
 									   stats2, stats1,
 									   have_mcvs2, have_mcvs1,
+									   matchfreq_mcvs2, nmatches_mcvs,
 									   inner_rel);
 			}
 
@@ -2455,7 +2465,9 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
 				bool isdefault1, bool isdefault2,
 				AttStatsSlot *sslot1, AttStatsSlot *sslot2,
 				Form_pg_statistic stats1, Form_pg_statistic stats2,
-				bool have_mcvs1, bool have_mcvs2)
+				bool have_mcvs1, bool have_mcvs2,
+				double *matchfreq_mcvs1, double *matchfreq_mcvs2,
+				int *nmatches_mcvs)
 {
 	double		selec;
 
@@ -2595,6 +2607,11 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
 			totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
 				(nd1 - nmatches);
 
+		/* Save MCV match statistics for possible reuse by eqjoinsel_semi() */
+		*matchfreq_mcvs1 = matchfreq1;
+		*matchfreq_mcvs2 = matchfreq2;
+		*nmatches_mcvs = nmatches;
+
 		/*
 		 * Use the smaller of the two estimates.  This can be justified in
 		 * essentially the same terms as given below for the no-stats case: to
@@ -2653,6 +2670,7 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation,
 			   AttStatsSlot *sslot1, AttStatsSlot *sslot2,
 			   Form_pg_statistic stats1, Form_pg_statistic stats2,
 			   bool have_mcvs1, bool have_mcvs2,
+			   double matchfreq1, int nmatches,
 			   RelOptInfo *inner_rel)
 {
 	double		selec;
@@ -2705,11 +2723,9 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation,
 		bool	   *hasmatch1;
 		bool	   *hasmatch2;
 		double		nullfrac1 = stats1->stanullfrac;
-		double		matchfreq1,
-					uncertainfrac,
+		double		uncertainfrac,
 					uncertain;
 		int			i,
-					nmatches,
 					clamped_nvalues2;
 
 		/*
@@ -2721,6 +2737,13 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation,
 		 */
 		clamped_nvalues2 = Min(sslot2->nvalues, nd2);
 
+		/*
+		 * eqjoinsel_inner() normally already did the full MCV comparison,
+		 * so we reuse its results unless RHS MCVs were clamped, in which
+		 * case we must redo the loop for the reduced list.
+		 */
+		if (clamped_nvalues2 != sslot2->nvalues)
+		{
 		fmgr_info(opfuncoid, &eqproc);
 
 		/*
@@ -2777,6 +2800,7 @@ eqjoinsel_semi(Oid opfuncoid, Oid collation,
 		CLAMP_PROBABILITY(matchfreq1);
 		pfree(hasmatch1);
 		pfree(hasmatch2);
+		}
 
 		/*
 		 * Now we need to estimate the fraction of relation 1 that has at
-- 
2.34.1

