use crate::error::EvalResult;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
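/// Summary of duplicate and collision findings for a set of records.
///
/// Collision counts (`pk_collisions`, `doc_number_collisions`) are reported
/// as totals only; `duplicate_groups` currently holds `Exact` and
/// `NearDuplicate` entries.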
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UniquenessAnalysis {
pub total_records: usize,
pub exact_duplicates: usize,
pub near_duplicates: usize,
pub duplicate_rate: f64,
pub pk_collisions: usize,
pub doc_number_collisions: usize,
pub duplicate_groups: Vec<DuplicateInfo>,
pub uniqueness_score: f64,
}
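/// A single group of duplicated records, with up to three example keys.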
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicateInfo {
pub duplicate_type: DuplicateType,
pub count: usize,
pub example_ids: Vec<String>,
pub similarity: Option<f64>,
}
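/// The kind of duplication detected for a group of records.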
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DuplicateType {
Exact,
NearDuplicate,
PrimaryKeyCollision,
DocumentNumberCollision,
}
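/// Per-record input to uniqueness analysis: a primary key, an optional
/// document number, a precomputed content hash, and the key fields used for
/// near-duplicate similarity.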
#[derive(Debug, Clone)]
pub struct UniqueRecord {
pub primary_key: String,
pub document_number: Option<String>,
pub content_hash: u64,
pub key_fields: Vec<String>,
}
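/// Detects exact duplicates (by content hash), key collisions, and sampled
/// near duplicates (by Jaccard similarity over key fields).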
pub struct UniquenessAnalyzer {
similarity_threshold: f64,
max_report_duplicates: usize,
}
impl UniquenessAnalyzer {
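    /// Creates an analyzer. `similarity_threshold` is a Jaccard similarity in
    /// `[0.0, 1.0]`; pairs scoring at or above it (but below 1.0) are counted
    /// as near duplicates. At most 100 duplicate groups are reported.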
pub fn new(similarity_threshold: f64) -> Self {
Self {
similarity_threshold,
max_report_duplicates: 100,
}
}
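    /// Analyzes `records` for exact duplicates, primary-key and
    /// document-number collisions, and near duplicates, deriving an overall
    /// uniqueness score of `1.0 - duplicate_rate`.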
pub fn analyze(&self, records: &[UniqueRecord]) -> EvalResult<UniquenessAnalysis> {
let total_records = records.len();
let mut duplicate_groups = Vec::new();
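        // Bucket record indices by content hash; each bucket of size n
        // contributes n - 1 exact duplicates (the extra copies).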
let mut hash_counts: HashMap<u64, Vec<usize>> = HashMap::new();
for (idx, record) in records.iter().enumerate() {
hash_counts
.entry(record.content_hash)
.or_default()
.push(idx);
}
let mut exact_duplicates = 0;
for indices in hash_counts.values() {
if indices.len() > 1 {
exact_duplicates += indices.len() - 1;
if duplicate_groups.len() < self.max_report_duplicates {
duplicate_groups.push(DuplicateInfo {
duplicate_type: DuplicateType::Exact,
count: indices.len(),
example_ids: indices
.iter()
.take(3)
.map(|&i| records[i].primary_key.clone())
.collect(),
similarity: Some(1.0),
});
}
}
}
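        // Count one collision per repeated key: `insert` returns false when
        // the key (or document number, below) has already been seen.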
let mut pk_seen: HashSet<&str> = HashSet::new();
let mut pk_collisions = 0;
for record in records {
if !pk_seen.insert(&record.primary_key) {
pk_collisions += 1;
}
}
let mut doc_seen: HashSet<&str> = HashSet::new();
let mut doc_number_collisions = 0;
for record in records {
if let Some(ref doc_num) = record.document_number {
if !doc_seen.insert(doc_num) {
doc_number_collisions += 1;
}
}
}
let near_duplicates = self.detect_near_duplicates(records, &mut duplicate_groups);
        let duplicate_rate = if total_records > 0 {
            // Near-duplicate counts are extrapolated from a sample, so clamp
            // the rate at 1.0 to keep the uniqueness score within [0.0, 1.0].
            (((exact_duplicates + near_duplicates) as f64) / total_records as f64).min(1.0)
        } else {
            0.0
        };
        let uniqueness_score = 1.0 - duplicate_rate;
Ok(UniquenessAnalysis {
total_records,
exact_duplicates,
near_duplicates,
duplicate_rate,
pk_collisions,
doc_number_collisions,
duplicate_groups,
uniqueness_score,
})
}
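    /// Pairwise near-duplicate scan. Comparison is O(n^2), so at most 1000
    /// records are sampled (evenly, by stride) and the pair count is
    /// extrapolated back to the full record set.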
fn detect_near_duplicates(
&self,
records: &[UniqueRecord],
duplicate_groups: &mut Vec<DuplicateInfo>,
) -> usize {
let mut near_duplicates = 0;
        // Pairwise comparison is O(n^2); cap the sample at 1000 records.
        let sample_size = records.len().min(1000);
        let step = if records.len() > sample_size {
            // Ceiling division so the stride spans the whole record set;
            // truncating division would skip the tail (e.g. 1500 records with
            // step 1 would only ever compare the first 1000).
            records.len().div_ceil(sample_size)
        } else {
            1
        };
        let sampled: Vec<_> = records.iter().step_by(step).take(sample_size).collect();
for i in 0..sampled.len() {
for j in (i + 1)..sampled.len() {
let sim = self.calculate_similarity(&sampled[i].key_fields, &sampled[j].key_fields);
if sim >= self.similarity_threshold && sim < 1.0 {
near_duplicates += 1;
if duplicate_groups.len() < self.max_report_duplicates {
duplicate_groups.push(DuplicateInfo {
duplicate_type: DuplicateType::NearDuplicate,
count: 2,
example_ids: vec![
sampled[i].primary_key.clone(),
sampled[j].primary_key.clone(),
],
similarity: Some(sim),
});
}
}
}
}
if step > 1 {
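            // A stride of `step` keeps roughly 1/step of the records, so the
            // number of sampled pairs shrinks by ~step^2; scale the count back
            // up to estimate pairs over the full record set.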
near_duplicates = near_duplicates * step * step;
}
near_duplicates
}
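    /// Jaccard similarity of the two field lists: |intersection| / |union|
    /// over the distinct values. Two empty lists are treated as identical.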
fn calculate_similarity(&self, fields1: &[String], fields2: &[String]) -> f64 {
if fields1.is_empty() && fields2.is_empty() {
return 1.0;
}
let set1: HashSet<_> = fields1.iter().collect();
let set2: HashSet<_> = fields2.iter().collect();
        let intersection = set1.intersection(&set2).count();
        let union = set1.union(&set2).count();
        // `union` cannot be zero here: the both-empty case returned early above.
        intersection as f64 / union as f64
}
}
impl Default for UniquenessAnalyzer {
fn default() -> Self {
        Self::new(0.9)
    }
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
use super::*;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
fn hash_content(s: &str) -> u64 {
let mut hasher = DefaultHasher::new();
s.hash(&mut hasher);
hasher.finish()
}
#[test]
fn test_no_duplicates() {
let records = vec![
UniqueRecord {
primary_key: "1".to_string(),
document_number: Some("DOC001".to_string()),
content_hash: hash_content("record1"),
key_fields: vec!["a".to_string(), "b".to_string()],
},
UniqueRecord {
primary_key: "2".to_string(),
document_number: Some("DOC002".to_string()),
content_hash: hash_content("record2"),
key_fields: vec!["c".to_string(), "d".to_string()],
},
];
let analyzer = UniquenessAnalyzer::default();
let result = analyzer.analyze(&records).unwrap();
assert_eq!(result.exact_duplicates, 0);
assert_eq!(result.pk_collisions, 0);
assert_eq!(result.doc_number_collisions, 0);
}
#[test]
fn test_exact_duplicates() {
let hash = hash_content("same_content");
let records = vec![
UniqueRecord {
primary_key: "1".to_string(),
document_number: Some("DOC001".to_string()),
content_hash: hash,
key_fields: vec!["a".to_string()],
},
UniqueRecord {
primary_key: "2".to_string(),
document_number: Some("DOC002".to_string()),
                content_hash: hash,
                key_fields: vec!["a".to_string()],
},
];
let analyzer = UniquenessAnalyzer::default();
let result = analyzer.analyze(&records).unwrap();
assert_eq!(result.exact_duplicates, 1);
}
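    #[test]
    fn test_near_duplicates() {
        // Exercises the Jaccard path: {a, b, c} vs {a, b, d} share 2 of 4
        // distinct values, so similarity is 0.5; with a threshold of 0.5 the
        // pair is flagged as a near duplicate but not an exact one.
        let records = vec![
            UniqueRecord {
                primary_key: "1".to_string(),
                document_number: None,
                content_hash: hash_content("record1"),
                key_fields: vec!["a".to_string(), "b".to_string(), "c".to_string()],
            },
            UniqueRecord {
                primary_key: "2".to_string(),
                document_number: None,
                content_hash: hash_content("record2"),
                key_fields: vec!["a".to_string(), "b".to_string(), "d".to_string()],
            },
        ];
        let analyzer = UniquenessAnalyzer::new(0.5);
        let result = analyzer.analyze(&records).unwrap();
        assert_eq!(result.near_duplicates, 1);
        assert_eq!(result.exact_duplicates, 0);
        assert_eq!(
            result.duplicate_groups[0].duplicate_type,
            DuplicateType::NearDuplicate
        );
    }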
#[test]
fn test_pk_collision() {
let records = vec![
UniqueRecord {
primary_key: "SAME_PK".to_string(),
document_number: None,
content_hash: hash_content("record1"),
key_fields: vec![],
},
UniqueRecord {
primary_key: "SAME_PK".to_string(),
document_number: None,
content_hash: hash_content("record2"),
key_fields: vec![],
},
];
let analyzer = UniquenessAnalyzer::default();
let result = analyzer.analyze(&records).unwrap();
assert_eq!(result.pk_collisions, 1);
}
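    #[test]
    fn test_doc_number_collision() {
        // A repeated document number counts one collision per extra
        // occurrence, independent of primary keys and content hashes.
        let records = vec![
            UniqueRecord {
                primary_key: "1".to_string(),
                document_number: Some("DOC001".to_string()),
                content_hash: hash_content("record1"),
                key_fields: vec![],
            },
            UniqueRecord {
                primary_key: "2".to_string(),
                document_number: Some("DOC001".to_string()),
                content_hash: hash_content("record2"),
                key_fields: vec![],
            },
        ];
        let analyzer = UniquenessAnalyzer::default();
        let result = analyzer.analyze(&records).unwrap();
        assert_eq!(result.doc_number_collisions, 1);
        assert_eq!(result.pk_collisions, 0);
    }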
}