use std::collections::hash_map::DefaultHasher;
use std::collections::HashSet;
use std::hash::{Hash, Hasher};
use async_trait::async_trait;
use cognis_core::documents::Document;
use cognis_core::error::Result;
use super::DocumentTransformer;
/// Removes documents whose `page_content` is byte-for-byte identical to an
/// earlier document, keeping only the first occurrence.
#[derive(Default)]
pub struct ExactDeduplicator;

impl ExactDeduplicator {
    /// Creates a new `ExactDeduplicator`.
    pub fn new() -> Self {
        Self::default()
    }
}
#[async_trait]
impl DocumentTransformer for ExactDeduplicator {
    /// Keeps the first occurrence of each distinct `page_content`, preserving
    /// input order; later exact duplicates are dropped.
    ///
    /// Never fails; always returns `Ok`.
    async fn transform_documents(&self, documents: &[Document]) -> Result<Vec<Document>> {
        // Track seen content by reference into `documents` instead of cloning
        // every page_content String into the set (the old code allocated a
        // full copy per document, including for duplicates that were dropped).
        let mut seen: HashSet<&str> = HashSet::with_capacity(documents.len());
        let mut results = Vec::new();
        for doc in documents {
            // `insert` returns true only for content not seen before.
            if seen.insert(doc.page_content.as_str()) {
                results.push(doc.clone());
            }
        }
        Ok(results)
    }

    fn name(&self) -> &str {
        "ExactDeduplicator"
    }
}
/// Drops documents that are nearly identical to a previously kept document,
/// where similarity is the Jaccard overlap of character n-gram sets.
pub struct FuzzyDeduplicator {
    // Minimum Jaccard similarity at which a document counts as a duplicate.
    similarity_threshold: f64,
    // Character n-gram width used to fingerprint document content.
    ngram_size: usize,
}

impl FuzzyDeduplicator {
    /// Default minimum similarity for two documents to count as duplicates.
    const DEFAULT_SIMILARITY_THRESHOLD: f64 = 0.9;
    /// Default character n-gram width.
    const DEFAULT_NGRAM_SIZE: usize = 3;

    /// Creates a deduplicator with the default threshold (0.9) and n-gram size (3).
    pub fn new() -> Self {
        Self {
            similarity_threshold: Self::DEFAULT_SIMILARITY_THRESHOLD,
            ngram_size: Self::DEFAULT_NGRAM_SIZE,
        }
    }

    /// Builder: overrides the similarity threshold; documents scoring at or
    /// above it against a kept document are discarded.
    pub fn with_similarity_threshold(mut self, threshold: f64) -> Self {
        self.similarity_threshold = threshold;
        self
    }

    /// Builder: overrides the character n-gram size.
    pub fn with_ngram_size(mut self, size: usize) -> Self {
        self.ngram_size = size;
        self
    }
}

impl Default for FuzzyDeduplicator {
    fn default() -> Self {
        Self::new()
    }
}
/// Splits `text` into the set of its character n-grams.
///
/// Texts shorter than `n` characters yield a single "gram" containing the
/// whole text, so short inputs still produce a comparable fingerprint.
/// `n == 0` is clamped to 1: `slice::windows` panics on a window size of
/// zero, and `FuzzyDeduplicator::with_ngram_size(0)` made that reachable.
fn char_ngrams(text: &str, n: usize) -> HashSet<String> {
    // Guard against a zero window size, which would panic in `windows`.
    let n = n.max(1);
    let chars: Vec<char> = text.chars().collect();
    if chars.len() < n {
        // Too short for even one full n-gram: use the whole text as-is.
        let mut set = HashSet::new();
        set.insert(text.to_string());
        return set;
    }
    chars
        .windows(n)
        .map(|w| w.iter().collect::<String>())
        .collect()
}
/// Jaccard similarity |A ∩ B| / |A ∪ B| of two n-gram sets, in [0.0, 1.0].
///
/// Two empty sets are defined as identical (similarity 1.0); that is the only
/// way the union can be empty, so the zero-union case doubles as that guard.
fn jaccard_similarity(a: &HashSet<String>, b: &HashSet<String>) -> f64 {
    let union = a.union(b).count();
    if union == 0 {
        // Both sets empty: treat them as identical.
        return 1.0;
    }
    let intersection = a.intersection(b).count();
    intersection as f64 / union as f64
}
#[async_trait]
impl DocumentTransformer for FuzzyDeduplicator {
    /// Keeps the first document of each near-duplicate cluster, in input
    /// order; later documents whose n-gram Jaccard similarity to any kept
    /// document meets the threshold are dropped.
    ///
    /// Never fails; always returns `Ok`. O(n²) pairwise comparisons, but each
    /// kept document's n-gram set is computed once and cached.
    async fn transform_documents(&self, documents: &[Document]) -> Result<Vec<Document>> {
        // Parallel vectors: unique_ngrams[i] is the fingerprint of unique_docs[i].
        let mut unique_docs: Vec<Document> = Vec::new();
        let mut unique_ngrams: Vec<HashSet<String>> = Vec::new();
        for candidate in documents {
            let candidate_ngrams = char_ngrams(&candidate.page_content, self.ngram_size);
            let matches_kept = unique_ngrams.iter().any(|kept| {
                jaccard_similarity(&candidate_ngrams, kept) >= self.similarity_threshold
            });
            if !matches_kept {
                unique_docs.push(candidate.clone());
                unique_ngrams.push(candidate_ngrams);
            }
        }
        Ok(unique_docs)
    }

    fn name(&self) -> &str {
        "FuzzyDeduplicator"
    }
}
/// Removes duplicate documents by comparing 64-bit hashes of `page_content`
/// rather than the content itself, trading exactness for lower memory use.
#[derive(Default)]
pub struct ContentHashDeduplicator;

impl ContentHashDeduplicator {
    /// Creates a new `ContentHashDeduplicator`.
    pub fn new() -> Self {
        Self::default()
    }
}
/// Returns a 64-bit hash of `text` using the standard library's
/// `DefaultHasher`. `DefaultHasher::new()` is documented to produce the same
/// initial state every time, so results are deterministic within a process —
/// but the algorithm is not guaranteed stable across Rust releases, so these
/// hashes must not be persisted or compared across runs of different builds.
fn compute_hash(text: &str) -> u64 {
    let mut state = DefaultHasher::new();
    Hash::hash(text, &mut state);
    state.finish()
}
#[async_trait]
impl DocumentTransformer for ContentHashDeduplicator {
    /// Keeps the first document for each distinct content hash, preserving
    /// input order.
    ///
    /// Never fails; always returns `Ok`.
    /// NOTE(review): a 64-bit hash collision would silently drop a
    /// non-duplicate document — acceptable for dedup, but worth knowing.
    async fn transform_documents(&self, documents: &[Document]) -> Result<Vec<Document>> {
        let mut seen_hashes: HashSet<u64> = HashSet::new();
        let mut unique = Vec::new();
        for doc in documents {
            let content_hash = compute_hash(&doc.page_content);
            // `insert` returns false when the hash was already recorded.
            if seen_hashes.insert(content_hash) {
                unique.push(doc.clone());
            }
        }
        Ok(unique)
    }

    fn name(&self) -> &str {
        "ContentHashDeduplicator"
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Helper: wraps a string slice in a `Document` fixture.
    fn make_doc(content: &str) -> Document {
        Document::new(content)
    }

    // Exact dedup drops the second identical document and keeps input order.
    #[tokio::test]
    async fn test_exact_dedup_removes_duplicates() {
        let dedup = ExactDeduplicator::new();
        let docs = vec![
            make_doc("hello world"),
            make_doc("hello world"),
            make_doc("different text"),
        ];
        let result = dedup.transform_documents(&docs).await.unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].page_content, "hello world");
        assert_eq!(result[1].page_content, "different text");
    }

    // All-unique input passes through untouched.
    #[tokio::test]
    async fn test_exact_dedup_all_unique() {
        let dedup = ExactDeduplicator::new();
        let docs = vec![make_doc("a"), make_doc("b"), make_doc("c")];
        let result = dedup.transform_documents(&docs).await.unwrap();
        assert_eq!(result.len(), 3);
    }

    // Empty input yields empty output (no panic, no spurious docs).
    #[tokio::test]
    async fn test_exact_dedup_empty() {
        let dedup = ExactDeduplicator::new();
        let result = dedup.transform_documents(&[]).await.unwrap();
        assert!(result.is_empty());
    }

    // N identical documents collapse to exactly one.
    #[tokio::test]
    async fn test_exact_dedup_all_same() {
        let dedup = ExactDeduplicator::new();
        let docs = vec![make_doc("same"), make_doc("same"), make_doc("same")];
        let result = dedup.transform_documents(&docs).await.unwrap();
        assert_eq!(result.len(), 1);
    }

    // Near-duplicates (one word differs) exceed the 0.8 Jaccard threshold on
    // 3-gram sets, so the "lazy cat" variant is dropped; the unrelated
    // sentence survives. NOTE: this depends on the exact fixture strings.
    #[tokio::test]
    async fn test_fuzzy_dedup_removes_near_duplicates() {
        let dedup = FuzzyDeduplicator::new().with_similarity_threshold(0.8);
        let docs = vec![
            make_doc("the quick brown fox jumps over the lazy dog"),
            make_doc("the quick brown fox jumps over the lazy cat"),
            make_doc("completely different content about rust programming"),
        ];
        let result = dedup.transform_documents(&docs).await.unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(
            result[0].page_content,
            "the quick brown fox jumps over the lazy dog"
        );
        assert_eq!(
            result[1].page_content,
            "completely different content about rust programming"
        );
    }

    // A very high threshold (0.95) keeps sentences that merely share a suffix.
    #[tokio::test]
    async fn test_fuzzy_dedup_keeps_dissimilar() {
        let dedup = FuzzyDeduplicator::new().with_similarity_threshold(0.95);
        let docs = vec![
            make_doc("rust is a systems programming language"),
            make_doc("python is a dynamic programming language"),
        ];
        let result = dedup.transform_documents(&docs).await.unwrap();
        assert_eq!(result.len(), 2);
    }

    // Exact duplicates have similarity 1.0, above any sane threshold.
    #[tokio::test]
    async fn test_fuzzy_dedup_exact_duplicates() {
        let dedup = FuzzyDeduplicator::new();
        let docs = vec![make_doc("identical text"), make_doc("identical text")];
        let result = dedup.transform_documents(&docs).await.unwrap();
        assert_eq!(result.len(), 1);
    }

    // Hash-based dedup drops exact duplicates just like ExactDeduplicator.
    #[tokio::test]
    async fn test_hash_dedup_removes_duplicates() {
        let dedup = ContentHashDeduplicator::new();
        let docs = vec![
            make_doc("hello world"),
            make_doc("hello world"),
            make_doc("different"),
        ];
        let result = dedup.transform_documents(&docs).await.unwrap();
        assert_eq!(result.len(), 2);
    }

    // Empty input yields empty output for the hash variant too.
    #[tokio::test]
    async fn test_hash_dedup_empty() {
        let dedup = ContentHashDeduplicator::new();
        let result = dedup.transform_documents(&[]).await.unwrap();
        assert!(result.is_empty());
    }

    // First occurrences are kept in their original relative order even when a
    // duplicate appears between them.
    #[tokio::test]
    async fn test_hash_dedup_preserves_order() {
        let dedup = ContentHashDeduplicator::new();
        let docs = vec![
            make_doc("first"),
            make_doc("second"),
            make_doc("first"),
            make_doc("third"),
        ];
        let result = dedup.transform_documents(&docs).await.unwrap();
        assert_eq!(result.len(), 3);
        assert_eq!(result[0].page_content, "first");
        assert_eq!(result[1].page_content, "second");
        assert_eq!(result[2].page_content, "third");
    }

    // Identical texts produce identical n-gram sets -> similarity 1.0.
    #[test]
    fn test_jaccard_identical() {
        let a = char_ngrams("hello world", 3);
        let b = char_ngrams("hello world", 3);
        assert!((jaccard_similarity(&a, &b) - 1.0).abs() < 1e-10);
    }

    // Disjoint n-gram sets ("aaa" vs "zzz") -> similarity 0.0.
    #[test]
    fn test_jaccard_completely_different() {
        let a = char_ngrams("aaaa", 3);
        let b = char_ngrams("zzzz", 3);
        assert!((jaccard_similarity(&a, &b)).abs() < 1e-10);
    }

    // Text shorter than n falls back to a single whole-text gram.
    #[test]
    fn test_char_ngrams_short_text() {
        let ngrams = char_ngrams("ab", 3);
        assert_eq!(ngrams.len(), 1);
        assert!(ngrams.contains("ab"));
    }
}