v6-0002-ANALYZE-hash-accelerate-MCV-tracking-for-equality.patch
text/x-patch
Filename: v6-0002-ANALYZE-hash-accelerate-MCV-tracking-for-equality.patch
Type: text/x-patch
Part: 0
From 6f081a9525aff0792fd42b2d629133ed5114a6de Mon Sep 17 00:00:00 2001
From: Evdokimov Ilia <ilya.evdokimov@tantorlabs.com>
Date: Sun, 3 May 2026 17:04:34 +0300
Subject: [PATCH v6 2/2] ANALYZE: hash-accelerate MCV tracking for
equality-only types
compute_distinct_stats() still performs a linear search of track[]
for each sampled value. At higher statistics targets, that lookup cost
can dominate ANALYZE time for equality-only datatypes.
When the type cache's default hash support matches the equality
operator, and the statistics target is at least
ANALYZE_HASH_THRESHOLD, maintain a simplehash table that maps tracked
values to their current track[] slots. That reduces match lookup from
linear to O(1) on average.
Entries that move or are replaced update the hash table so track[] and
the lookup structure stay in sync, while the existing linear path
remains available as a fallback.
---
src/backend/commands/analyze.c | 171 ++++++++++++++++++++++++++++++---
1 file changed, 160 insertions(+), 11 deletions(-)
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 262adf41b24..2e6fbe5cc12 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -55,6 +55,7 @@
#include "utils/sortsupport.h"
#include "utils/syscache.h"
#include "utils/timestamp.h"
+#include "utils/typcache.h"
/* Per-index data for ANALYZE */
typedef struct AnlIndexData
@@ -1900,6 +1901,12 @@ ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull)
*/
#define WIDTH_THRESHOLD 1024
+/*
+ * Build a hash table for distinct/MCV tracking only when the statistics
+ * target is large enough to justify the overhead of maintaining it.
+ */
+#define ANALYZE_HASH_THRESHOLD 100
+
#define swapInt(a,b) do {int _tmp; _tmp=a; a=b; b=_tmp;} while(0)
#define swapDatum(a,b) do {Datum _tmp; _tmp=a; a=b; b=_tmp;} while(0)
@@ -1918,7 +1925,29 @@ typedef struct
int *tupnoLink;
} CompareScalarsContext;
+/* Entries in the simplehash hash table used by compute_distinct_stats */
+typedef struct DistinctHashEntry
+{
+ Datum value; /* the value represented by this entry */
+ int index; /* its index in the relevant track */
+ uint32 hash; /* hash code for the Datum */
+ char status; /* status code used by simplehash.h */
+} DistinctHashEntry;
+
+/* private_data for the simplehash hash table */
+typedef struct DistinctHashContext
+{
+ FmgrInfo *eqfunc; /* the equality operator */
+ FmgrInfo *hashfunc; /* the hash function to use */
+ Oid collation; /* collation to use equality and hash calls */
+} DistinctHashContext;
+
+/* forward reference */
+typedef struct DistinctHash_hash DistinctHash_hash;
+
+static uint32 distinct_hash_hash(DistinctHash_hash * tab, Datum key);
+static bool distinct_hash_equal(DistinctHash_hash * tab, Datum key0, Datum key1);
static void compute_trivial_stats(VacAttrStatsP stats,
AnalyzeAttrFetchFunc fetchfunc,
int samplerows,
@@ -1940,6 +1969,53 @@ static int analyze_mcv_list(int *mcv_counts,
int samplerows,
double totalrows);
+/* Define support routines for compute distinct values hash tables */
+#define SH_PREFIX DistinctHash
+#define SH_ELEMENT_TYPE DistinctHashEntry
+#define SH_KEY_TYPE Datum
+#define SH_KEY value
+#define SH_HASH_KEY(tab, key) distinct_hash_hash(tab, key)
+#define SH_EQUAL(tab, key0, key1) distinct_hash_equal(tab, key0, key1)
+#define SH_SCOPE static inline
+#define SH_STORE_HASH
+#define SH_GET_HASH(tab, ent) ((ent)->hash)
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+
+/*
+ * Support functions for the hash tables used by compute_distinct_stats
+ */
+static uint32
+distinct_hash_hash(DistinctHash_hash * tab, Datum key)
+{
+ DistinctHashContext *context = (DistinctHashContext *) tab->private_data;
+ Datum result;
+
+ result = FunctionCall1Coll(context->hashfunc, context->collation, key);
+ return DatumGetUInt32(result);
+}
+
+static bool
+distinct_hash_equal(DistinctHash_hash * tab, Datum key0, Datum key1)
+{
+ DistinctHashContext *context = (DistinctHashContext *) tab->private_data;
+ Datum result;
+
+ result = FunctionCall2Coll(context->eqfunc, context->collation, key0, key1);
+ return DatumGetBool(result);
+}
+
+static inline void
+distinct_hash_set_index(DistinctHash_hash * hash, Datum value,
+ uint32 value_hash, int index)
+{
+ DistinctHashEntry *entry = DistinctHash_lookup_hash(hash, value, value_hash);
+
+ Assert(entry != NULL);
+ entry->index = index;
+}
/*
* std_typanalyze -- the default type-specific typanalyze function
@@ -2129,15 +2205,20 @@ compute_distinct_stats(VacAttrStatsP stats,
bool is_varwidth = (!stats->attrtype->typbyval &&
stats->attrtype->typlen < 0);
FmgrInfo f_cmpeq;
+ TypeCacheEntry *typentry;
typedef struct
{
Datum value;
int count;
+ uint32 hash;
} TrackItem;
TrackItem *track;
int track_cnt,
track_max;
int num_mcv = stats->attstattarget;
+ bool use_hash;
+ DistinctHashContext hash_context;
+ DistinctHash_hash *track_hash = NULL;
int firstcount1 = 0, /* index of first singleton in track[] */
c1_cursor = 0; /* next singleton to evict (FIFO) */
StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;
@@ -2152,6 +2233,26 @@ compute_distinct_stats(VacAttrStatsP stats,
track_cnt = 0;
fmgr_info(mystats->eqfunc, &f_cmpeq);
+ typentry = lookup_type_cache(stats->attrtypid,
+ TYPECACHE_HASH_PROC_FINFO | TYPECACHE_EQ_OPR);
+
+ /*
+ * For sufficiently large statistics targets, use a hash table to avoid
+ * repeated linear searches of the track[] array, but only when we can use
+ * the type's default hash support that matches the equality operator.
+ */
+ use_hash = (num_mcv >= ANALYZE_HASH_THRESHOLD &&
+ mystats->eqopr == typentry->eq_opr &&
+ OidIsValid(typentry->hash_proc));
+
+ if (use_hash)
+ {
+ hash_context.eqfunc = &f_cmpeq;
+ hash_context.hashfunc = &typentry->hash_proc_finfo;
+ hash_context.collation = stats->attrcollid;
+ track_hash = DistinctHash_create(CurrentMemoryContext, track_max,
+ &hash_context);
+ }
for (i = 0; i < samplerows; i++)
{
@@ -2160,6 +2261,7 @@ compute_distinct_stats(VacAttrStatsP stats,
bool match;
int match_index,
j;
+ uint32 value_hash = 0;
vacuum_delay_point(true);
@@ -2206,20 +2308,33 @@ compute_distinct_stats(VacAttrStatsP stats,
/*
* See if the value matches anything we're already tracking.
*/
- match = false;
- firstcount1 = track_cnt;
- for (j = 0; j < track_cnt; j++)
+ if (use_hash)
{
- if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
- stats->attrcollid,
- value, track[j].value)))
+ DistinctHashEntry *entry;
+
+ value_hash = distinct_hash_hash(track_hash, value);
+ entry = DistinctHash_lookup_hash(track_hash, value, value_hash);
+ match = (entry != NULL);
+ if (match)
+ match_index = entry->index;
+ }
+ else
+ {
+ match = false;
+ firstcount1 = track_cnt;
+ for (j = 0; j < track_cnt; j++)
{
- match = true;
- match_index = j;
- break;
+ if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
+ stats->attrcollid,
+ value, track[j].value)))
+ {
+ match = true;
+ match_index = j;
+ break;
+ }
+ if (j < firstcount1 && track[j].count == 1)
+ firstcount1 = j;
}
- if (j < firstcount1 && track[j].count == 1)
- firstcount1 = j;
}
if (match)
@@ -2234,6 +2349,18 @@ compute_distinct_stats(VacAttrStatsP stats,
{
swapDatum(track[j].value, track[j - 1].value);
swapInt(track[j].count, track[j - 1].count);
+ if (use_hash)
+ {
+ uint32 tmp;
+
+ tmp = track[j].hash;
+ track[j].hash = track[j - 1].hash;
+ track[j - 1].hash = tmp;
+ distinct_hash_set_index(track_hash, track[j].value,
+ track[j].hash, j);
+ distinct_hash_set_index(track_hash, track[j - 1].value,
+ track[j - 1].hash, j - 1);
+ }
}
/*
@@ -2281,12 +2408,34 @@ compute_distinct_stats(VacAttrStatsP stats,
insert_index = c1_cursor++;
if (c1_cursor >= track_cnt)
c1_cursor = firstcount1;
+
+ if (use_hash)
+ {
+ DistinctHashEntry *delentry;
+
+ delentry = DistinctHash_lookup_hash(track_hash,
+ track[insert_index].value,
+ track[insert_index].hash);
+ Assert(delentry != NULL);
+ DistinctHash_delete_item(track_hash, delentry);
+ }
}
else
continue;
track[insert_index].value = value;
track[insert_index].count = 1;
+ if (use_hash)
+ {
+ bool found_hash;
+ DistinctHashEntry *entry;
+
+ track[insert_index].hash = value_hash;
+ entry = DistinctHash_insert_hash(track_hash, value, value_hash,
+ &found_hash);
+ Assert(!found_hash);
+ entry->index = insert_index;
+ }
}
}
--
2.34.1