use std::collections::HashSet;
use crate::agent::Evidence;
/// Minimum content length an evidence item needs to pass the quality filter in `dedup`.
///
/// NOTE(review): `dedup` compares this against `content.len()`, which is a UTF-8 byte
/// count rather than a character count — confirm that matches the intent of the name.
const MIN_EVIDENCE_CHARS: usize = 50;
/// Jaccard similarity (over the token sets produced by `tokenize`) at or above which
/// `dedup` treats an item as a near-duplicate of one it has already kept.
const SIMILARITY_THRESHOLD: f64 = 0.8;
/// Filters out low-quality and redundant evidence, preserving input order.
///
/// Each item goes through three checks, in this order; the first one that fails
/// drops the item:
///
/// 1. **Quality** – `content.len()` (a byte count) must be at least
///    `MIN_EVIDENCE_CHARS`.
/// 2. **Source** – only the first item for a given `(doc_name, source_path)` pair
///    is kept. A missing `doc_name` is treated as the empty string.
/// 3. **Similarity** – the item is dropped when the Jaccard similarity between its
///    token set and that of any already-kept item is at least
///    `SIMILARITY_THRESHOLD`.
///
/// Items rejected by the quality check do not claim their source, so a later,
/// longer item from the same source can still be kept. Items that pass the source
/// check do claim it, even if they are then rejected as near-duplicates.
///
/// Returns clones of the surviving items.
pub fn dedup(evidence: &[Evidence]) -> Vec<Evidence> {
    // A tuple key keeps the two fields separate. Joining them with a delimiter would
    // let ("a:b", "c") and ("a", "b:c") collide and wrongly drop one of them.
    let mut seen_sources: HashSet<(&str, &str)> = HashSet::new();
    let mut kept: Vec<Evidence> = Vec::new();
    // Token sets of `kept`, index-aligned, so each kept item is tokenized only once
    // instead of once per comparison.
    let mut kept_tokens: Vec<HashSet<String>> = Vec::new();

    for ev in evidence {
        if ev.content.len() < MIN_EVIDENCE_CHARS {
            continue;
        }

        let source_key = (
            ev.doc_name.as_deref().unwrap_or(""),
            ev.source_path.as_str(),
        );
        // `insert` returns false when the key was already present.
        if !seen_sources.insert(source_key) {
            continue;
        }

        let tokens = tokenize(&ev.content);
        let is_near_duplicate = kept_tokens
            .iter()
            .any(|existing| jaccard(&tokens, existing) >= SIMILARITY_THRESHOLD);
        if !is_near_duplicate {
            kept.push(ev.clone());
            kept_tokens.push(tokens);
        }
    }

    kept
}
/// Splits `text` into its set of distinct lowercase, whitespace-separated words.
///
/// Punctuation is not stripped, so `"foo,"` and `"foo"` are different tokens.
fn tokenize(text: &str) -> HashSet<String> {
    let lowered = text.to_lowercase();
    let mut words = HashSet::new();
    for word in lowered.split_whitespace() {
        words.insert(word.to_owned());
    }
    words
}
/// Jaccard similarity of two token sets: `|a ∩ b| / |a ∪ b|`.
///
/// Two empty sets are defined to be identical and yield `1.0`.
fn jaccard(a: &HashSet<String>, b: &HashSet<String>) -> f64 {
    let shared = a.iter().filter(|token| b.contains(*token)).count();
    // Inclusion–exclusion: |a ∪ b| = |a| + |b| − |a ∩ b|.
    let total = a.len() + b.len() - shared;
    if total == 0 {
        // Only reachable when both sets are empty.
        1.0
    } else {
        shared as f64 / total as f64
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an `Evidence` fixture in document `"doc"` whose source path is
    /// `root/<title>`, so two fixtures with the same title share a source.
    fn make_evidence(title: &str, content: &str) -> Evidence {
        Evidence {
            source_path: format!("root/{}", title),
            node_title: title.to_string(),
            content: content.to_string(),
            doc_name: Some("doc".to_string()),
        }
    }

    /// Content below the minimum length is dropped; longer content is kept.
    #[test]
    fn test_quality_filter() {
        let evidence = vec![
            make_evidence("A", "short"),
            make_evidence("B", &"x".repeat(60)),
        ];
        let result = dedup(&evidence);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].node_title, "B");
    }

    /// Two items from the same source collapse to the first one seen.
    #[test]
    fn test_source_dedup() {
        let evidence = vec![
            make_evidence(
                "A",
                "content A with enough text to pass the quality filter threshold",
            ),
            make_evidence(
                "A",
                "different content A but same source path that is long enough",
            ),
        ];
        let result = dedup(&evidence);
        assert_eq!(result.len(), 1);
        assert!(result[0].content.starts_with("content A"));
    }

    /// A near-duplicate (16 of 17 tokens shared) is dropped; unrelated content is kept.
    #[test]
    fn test_content_similarity_dedup() {
        let base = "This is a piece of evidence about machine learning algorithms and their applications in real world scenarios";
        let similar = "This is a piece of evidence about machine learning algorithms and their applications in real world";
        let different =
            "Completely unrelated content about quantum physics and particle accelerators at CERN";
        let evidence = vec![
            make_evidence("A", base),
            make_evidence("B", similar),
            make_evidence("C", different),
        ];
        let result = dedup(&evidence);
        // Assert the exact survivors: a `>= 2` check would also pass if nothing
        // had been deduplicated at all.
        let kept: Vec<&str> = result.iter().map(|e| e.node_title.as_str()).collect();
        assert_eq!(kept, vec!["A", "C"]);
    }

    #[test]
    fn test_empty_input() {
        let result = dedup(&[]);
        assert!(result.is_empty());
    }

    #[test]
    fn test_jaccard_identical() {
        let a = tokenize("hello world foo");
        let b = tokenize("hello world foo");
        assert!((jaccard(&a, &b) - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_jaccard_disjoint() {
        let a = tokenize("aaa bbb");
        let b = tokenize("ccc ddd");
        assert!((jaccard(&a, &b)).abs() < 0.001);
    }
}