v0-0001-Memoise-the-inner-of-SEMI-and-ANTI-join.patch

text/plain

Filename: v0-0001-Memoise-the-inner-of-SEMI-and-ANTI-join.patch
Type: text/plain
Part: 0
Message: Memoize ANTI and SEMI JOIN inner

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v0-0001
Subject: Memoise the inner of SEMI- and ANTI-join.
File+
src/backend/commands/explain.c 9 0
src/backend/optimizer/path/costsize.c 1 1
src/backend/optimizer/path/joinpath.c 3 2
src/backend/optimizer/util/pathnode.c 1 1
src/test/regress/expected/join.out 2 1
src/test/regress/expected/subselect.out 2 1
From e47abde1f6a2c8c8d7fa588df72e59c0cd049d4c Mon Sep 17 00:00:00 2001
From: "Andrei V. Lepikhov" <lepihov@gmail.com>
Date: Wed, 25 Dec 2024 14:33:12 +0700
Subject: [PATCH v0] Memoise the inner of SEMI- and ANTI-join.

To get the result of semi-join or anti-join, we only need one tuple from
the parameterised inner. The Postgres core already includes Memoize's
single_row mode for the case when it is proved that inner returns only a single
value. Thus, implementing similar logic for semi-joins and anti-joins is
quite feasible.

It is clear that the semi-join operates correctly: it returns the outer tuple
as soon as the first tuple is received from the inner query. In contrast, the
behaviour of the anti-join is less straightforward. If anti-join had a join
clause not pushed to the parameterised inner, it would be possible to read more
than a single tuple from the inner, which causes inconsistency. However,
it seems that this is an impossible case for the current planner.
---
 src/backend/commands/explain.c          | 9 +++++++++
 src/backend/optimizer/path/costsize.c   | 2 +-
 src/backend/optimizer/path/joinpath.c   | 5 +++--
 src/backend/optimizer/util/pathnode.c   | 2 +-
 src/test/regress/expected/join.out      | 3 ++-
 src/test/regress/expected/subselect.out | 3 ++-
 6 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index d8a7232cedb..c90067dadcb 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -3632,6 +3632,10 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es)
 	{
 		ExplainPropertyText("Cache Key", keystr.data, es);
 		ExplainPropertyText("Cache Mode", mstate->binary_mode ? "binary" : "logical", es);
+
+		/* Report only in the single mode case to not break current tests */
+		if (mstate->singlerow)
+			ExplainPropertyText("Store Mode", "singlerow", es);
 	}
 	else
 	{
@@ -3639,6 +3643,11 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es)
 		appendStringInfo(es->str, "Cache Key: %s\n", keystr.data);
 		ExplainIndentText(es);
 		appendStringInfo(es->str, "Cache Mode: %s\n", mstate->binary_mode ? "binary" : "logical");
+		if (mstate->singlerow)
+		{
+			ExplainIndentText(es);
+			appendStringInfo(es->str, "Store Mode: %s\n", "singlerow");
+		}
 	}
 
 	pfree(keystr.data);
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 73d78617009..4cca5a7395c 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -2545,7 +2545,7 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath,
 	ListCell   *lc;
 	Cost		input_startup_cost = mpath->subpath->startup_cost;
 	Cost		input_total_cost = mpath->subpath->total_cost;
-	double		tuples = mpath->subpath->rows;
+	double		tuples = mpath->singlerow ? 1 : mpath->subpath->rows;
 	double		calls = mpath->calls;
 	int			width = mpath->subpath->pathtarget->width;
 
diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c
index 18891ce9156..89738b483ef 100644
--- a/src/backend/optimizer/path/joinpath.c
+++ b/src/backend/optimizer/path/joinpath.c
@@ -682,6 +682,7 @@ get_memoize_path(PlannerInfo *root, RelOptInfo *innerrel,
 	ListCell   *lc;
 	bool		binary_mode;
 	List	   *ph_lateral_vars;
+	bool		single_mode = false;
 
 	/* Obviously not if it's disabled */
 	if (!enable_memoize)
@@ -725,7 +726,7 @@ get_memoize_path(PlannerInfo *root, RelOptInfo *innerrel,
 	 */
 	if (!extra->inner_unique && (jointype == JOIN_SEMI ||
 								 jointype == JOIN_ANTI))
-		return NULL;
+		single_mode = true;
 
 	/*
 	 * Memoize normally marks cache entries as complete when it runs out of
@@ -808,7 +809,7 @@ get_memoize_path(PlannerInfo *root, RelOptInfo *innerrel,
 											inner_path,
 											param_exprs,
 											hash_operators,
-											extra->inner_unique,
+											extra->inner_unique || single_mode,
 											binary_mode,
 											outer_path->rows);
 	}
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 93e73cb44db..9b4c0f96193 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1707,7 +1707,7 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 	 */
 	pathnode->path.startup_cost = subpath->startup_cost + cpu_tuple_cost;
 	pathnode->path.total_cost = subpath->total_cost + cpu_tuple_cost;
-	pathnode->path.rows = subpath->rows;
+	pathnode->path.rows = (pathnode->singlerow) ? 1 : subpath->rows;
 
 	return pathnode;
 }
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index a57bb18c24f..e8293c3c87d 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -6469,10 +6469,11 @@ select * from sj t1
    ->  Memoize
          Cache Key: t1.a, t1.b
          Cache Mode: binary
+         Store Mode: singlerow
          ->  Sample Scan on sj
                Sampling: system (t1.b)
                Filter: (t1.a = a)
-(8 rows)
+(9 rows)
 
 -- Ensure that SJE does not form a self-referential lateral dependency
 explain (costs off)
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out
index d0db8a412ff..5ab56f8d71f 100644
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -2642,6 +2642,7 @@ ON B.hundred in (SELECT min(c.hundred) FROM tenk2 C WHERE c.odd = b.odd);
                ->  Memoize
                      Cache Key: b.hundred, b.odd
                      Cache Mode: binary
+                     Store Mode: singlerow
                      ->  Subquery Scan on "ANY_subquery"
                            Filter: (b.hundred = "ANY_subquery".min)
                            ->  Result
@@ -2650,5 +2651,5 @@ ON B.hundred in (SELECT min(c.hundred) FROM tenk2 C WHERE c.odd = b.odd);
                                          ->  Index Scan using tenk2_hundred on tenk2 c
                                                Index Cond: (hundred IS NOT NULL)
                                                Filter: (odd = b.odd)
-(16 rows)
+(17 rows)
 
-- 
2.48.1