#![cfg_attr(not(feature = "sal"), allow(dead_code))]
use std::collections::HashSet;
use crate::embeddings::Embedder;
use crate::models::Memory;
use super::pipeline::MemoryId;
pub(crate) const JACCARD_THRESHOLD: f64 = 0.55;
pub(crate) const MAX_CLUSTER_SIZE: usize = 8;
pub(crate) const DEFAULT_COSINE_THRESHOLD: f32 = 0.75;
pub(crate) struct JaccardClustering {
pub(crate) threshold: f64,
pub(crate) max_cluster_size: usize,
}
impl Default for JaccardClustering {
fn default() -> Self {
Self {
threshold: JACCARD_THRESHOLD,
max_cluster_size: MAX_CLUSTER_SIZE,
}
}
}
impl JaccardClustering {
pub(crate) fn cluster_memories(&self, memories: &[Memory]) -> Vec<Vec<MemoryId>> {
let mut by_ns: std::collections::HashMap<&str, Vec<&Memory>> =
std::collections::HashMap::new();
for m in memories {
if m.namespace.starts_with('_') {
continue;
}
by_ns.entry(&m.namespace).or_default().push(m);
}
let mut clusters: Vec<Vec<MemoryId>> = Vec::new();
for (_ns, group) in by_ns {
let mut used = vec![false; group.len()];
for i in 0..group.len() {
if used[i] {
continue;
}
let mut cluster = vec![group[i].id.clone()];
used[i] = true;
for j in (i + 1)..group.len() {
if used[j] {
continue;
}
if cluster.len() >= self.max_cluster_size {
break;
}
if jaccard_similarity(&group[i].content, &group[j].content) >= self.threshold {
cluster.push(group[j].id.clone());
used[j] = true;
}
}
if cluster.len() >= 2 {
clusters.push(cluster);
}
}
}
clusters
}
}
pub(crate) struct CosineClustering {
pub(crate) threshold: f32,
pub(crate) max_cluster_size: usize,
pub(crate) embedder: Option<Embedder>,
}
impl CosineClustering {
pub(crate) fn new(embedder: Option<Embedder>) -> Self {
Self {
threshold: DEFAULT_COSINE_THRESHOLD,
max_cluster_size: MAX_CLUSTER_SIZE,
embedder,
}
}
pub(crate) fn cluster_memories(&self, memories: &[Memory]) -> Vec<Vec<MemoryId>> {
let Some(ref embedder) = self.embedder else {
return Vec::new();
};
let embedded: Vec<(&Memory, Vec<f32>)> = memories
.iter()
.filter(|m| !m.namespace.starts_with('_'))
.filter_map(|m| embedder.embed(&m.content).ok().map(|v| (m, v)))
.collect();
if embedded.is_empty() {
return Vec::new();
}
let n = embedded.len();
let mut assigned = vec![false; n];
let mut clusters: Vec<Vec<MemoryId>> = Vec::new();
for i in 0..n {
if assigned[i] {
continue;
}
let mut cluster = vec![embedded[i].0.id.clone()];
assigned[i] = true;
for j in (i + 1)..n {
if assigned[j] {
continue;
}
if cluster.len() >= self.max_cluster_size {
break;
}
if embedded[i].0.namespace != embedded[j].0.namespace {
continue;
}
let sim = Embedder::cosine_similarity(&embedded[i].1, &embedded[j].1);
if sim >= self.threshold {
cluster.push(embedded[j].0.id.clone());
assigned[j] = true;
}
}
if cluster.len() >= 2 {
clusters.push(cluster);
}
}
clusters
}
}
pub(super) fn jaccard_similarity(a: &str, b: &str) -> f64 {
let tokens = |s: &str| -> HashSet<String> {
s.split(|c: char| !c.is_alphanumeric())
.filter(|t| t.len() >= 3)
.map(str::to_lowercase)
.collect()
};
let ta = tokens(a);
let tb = tokens(b);
if ta.is_empty() && tb.is_empty() {
return 0.0;
}
let inter = ta.intersection(&tb).count();
let union = ta.union(&tb).count();
if union == 0 {
0.0
} else {
#[allow(clippy::cast_precision_loss)]
let result = inter as f64 / union as f64;
result
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::models::{Memory, Tier};
fn make_memory(id: &str, ns: &str, content: &str) -> Memory {
let now = chrono::Utc::now().to_rfc3339();
Memory {
id: id.to_string(),
tier: Tier::Long,
namespace: ns.to_string(),
title: id.to_string(),
content: content.to_string(),
tags: vec![],
priority: 5,
confidence: 1.0,
source: "test".to_string(),
access_count: 0,
created_at: now.clone(),
updated_at: now,
last_accessed_at: None,
expires_at: None,
metadata: serde_json::json!({}),
reflection_depth: 0,
memory_kind: crate::models::MemoryKind::Observation,
entity_id: None,
persona_version: None,
citations: Vec::new(),
source_uri: None,
source_span: None,
confidence_source: crate::models::ConfidenceSource::CallerProvided,
confidence_signals: None,
confidence_decayed_at: None,
version: 1,
}
}
#[test]
fn jaccard_identical_strings() {
let s = "kubernetes rolling canary deploy strategy kubernetes deploy";
assert!((jaccard_similarity(s, s) - 1.0).abs() < 1e-9);
}
#[test]
fn jaccard_disjoint_strings() {
let a = "apple banana cherry";
let b = "delta echo foxtrot";
assert_eq!(jaccard_similarity(a, b), 0.0);
}
#[test]
fn jaccard_partial_overlap() {
let a = "rust programming language memory safety";
let b = "rust language systems programming";
let sim = jaccard_similarity(a, b);
assert!(sim > 0.0 && sim < 1.0, "sim={sim}");
}
#[test]
fn jaccard_empty_strings() {
assert_eq!(jaccard_similarity("", ""), 0.0);
}
#[test]
fn jaccard_clustering_groups_near_duplicates() {
let strategy = JaccardClustering::default();
let m1 = make_memory(
"a",
"ns",
"kubernetes rolling canary deploy strategy kubernetes deploy",
);
let m2 = make_memory(
"b",
"ns",
"kubernetes rolling canary deploy strategy kubernetes deploy",
);
let m3 = make_memory("c", "ns", "completely different unrelated content here");
let clusters = strategy.cluster_memories(&[m1, m2, m3]);
assert_eq!(clusters.len(), 1, "expected one cluster; got {clusters:?}");
let cluster = &clusters[0];
assert!(cluster.contains(&"a".to_string()));
assert!(cluster.contains(&"b".to_string()));
assert!(!cluster.contains(&"c".to_string()));
}
#[test]
fn jaccard_clustering_never_merges_across_namespaces() {
let strategy = JaccardClustering::default();
let m1 = make_memory("a", "ns1", "kubernetes rolling canary deploy strategy");
let m2 = make_memory("b", "ns2", "kubernetes rolling canary deploy strategy");
let clusters = strategy.cluster_memories(&[m1, m2]);
assert!(
clusters.is_empty(),
"expected no cross-ns clusters; got {clusters:?}"
);
}
#[test]
fn jaccard_clustering_skips_internal_namespaces() {
let strategy = JaccardClustering::default();
let m1 = make_memory("a", "_curator", "kubernetes rolling canary deploy strategy");
let m2 = make_memory("b", "_curator", "kubernetes rolling canary deploy strategy");
let clusters = strategy.cluster_memories(&[m1, m2]);
assert!(clusters.is_empty(), "internal ns must be skipped");
}
#[test]
fn jaccard_clustering_respects_max_cluster_size() {
let strategy = JaccardClustering {
threshold: 0.0, max_cluster_size: 3,
};
let mems: Vec<Memory> = (0..10)
.map(|i| make_memory(&format!("m{i}"), "ns", "shared token content shared"))
.collect();
let clusters = strategy.cluster_memories(&mems);
for c in &clusters {
assert!(c.len() <= 3, "cluster size {}", c.len());
}
}
#[test]
fn cosine_clustering_without_embedder_returns_empty() {
let strategy = CosineClustering::new(None);
let m1 = make_memory("a", "ns", "kubernetes rolling canary deploy strategy");
let m2 = make_memory("b", "ns", "kubernetes rolling canary deploy strategy");
let clusters = strategy.cluster_memories(&[m1, m2]);
assert!(clusters.is_empty());
}
#[test]
fn jaccard_clustering_with_empty_input_returns_empty() {
let strategy = JaccardClustering::default();
let clusters = strategy.cluster_memories(&[]);
assert!(clusters.is_empty());
}
#[test]
fn jaccard_clustering_skips_already_used_member() {
let strategy = JaccardClustering {
threshold: 0.3,
max_cluster_size: 10,
};
let s = "shared keyword tokens deployment plan strategy";
let m1 = make_memory("a", "ns", s);
let m2 = make_memory("b", "ns", s);
let m3 = make_memory("c", "ns", s);
let clusters = strategy.cluster_memories(&[m1, m2, m3]);
assert_eq!(clusters.len(), 1);
assert_eq!(clusters[0].len(), 3);
}
fn try_local_embedder() -> Option<Embedder> {
Embedder::new_local().ok()
}
#[test]
fn cosine_clustering_with_embedder_clusters_similar_content() {
let Some(embedder) = try_local_embedder() else {
return;
};
let strategy = CosineClustering::new(Some(embedder));
let m1 = make_memory(
"a",
"ns",
"Kubernetes rolling canary deployment strategy notes",
);
let m2 = make_memory(
"b",
"ns",
"Kubernetes rolling canary deployment strategy notes",
);
let clusters = strategy.cluster_memories(&[m1, m2]);
assert_eq!(clusters.len(), 1);
assert_eq!(clusters[0].len(), 2);
}
#[test]
fn cosine_clustering_never_merges_across_namespaces() {
let Some(embedder) = try_local_embedder() else {
return;
};
let strategy = CosineClustering::new(Some(embedder));
let m1 = make_memory("a", "ns1", "identical content for both rows");
let m2 = make_memory("b", "ns2", "identical content for both rows");
let clusters = strategy.cluster_memories(&[m1, m2]);
assert!(clusters.is_empty());
}
#[test]
fn cosine_clustering_skips_internal_namespaces() {
let Some(embedder) = try_local_embedder() else {
return;
};
let strategy = CosineClustering::new(Some(embedder));
let m1 = make_memory("a", "_curator", "shared content tokens");
let m2 = make_memory("b", "_curator", "shared content tokens");
let clusters = strategy.cluster_memories(&[m1, m2]);
assert!(clusters.is_empty());
}
#[test]
fn cosine_clustering_respects_max_cluster_size() {
let Some(embedder) = try_local_embedder() else {
return;
};
let strategy = CosineClustering {
threshold: 0.5,
max_cluster_size: 2,
embedder: Some(embedder),
};
let s = "identical clustering content goes here always";
let mems: Vec<Memory> = (0..6)
.map(|i| make_memory(&format!("m{i}"), "ns", s))
.collect();
let clusters = strategy.cluster_memories(&mems);
for c in &clusters {
assert!(c.len() <= 2, "cluster size {}", c.len());
}
}
#[test]
fn cosine_clustering_drops_low_similarity_singletons() {
let Some(embedder) = try_local_embedder() else {
return;
};
let strategy = CosineClustering::new(Some(embedder));
let topics = [
"Python list comprehension idioms for filtering",
"Reverse-engineering binary protocols on the wire",
"Cherry-picking commits across forked git branches",
"Distributed consensus by Raft leader election",
"Cuisine of southern Italy and Sicilian olive oil",
"Quantum-mechanical interpretation of double-slit experiment",
];
let mems: Vec<Memory> = topics
.iter()
.enumerate()
.map(|(i, t)| make_memory(&format!("m{i}"), "ns", t))
.collect();
let clusters = strategy.cluster_memories(&mems);
for c in &clusters {
assert!(c.len() >= 2);
}
}
}