datasynth_eval/quality/uniqueness.rs

//! Uniqueness and duplicate detection evaluation.
//!
//! Detects exact duplicates, near-duplicates, and validates primary key uniqueness.
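//!
//! # Example
//!
//! A minimal usage sketch (assuming this module is exposed as
//! `datasynth_eval::quality::uniqueness` and that the caller hashes
//! serialized record content itself):
//!
//! ```no_run
//! use datasynth_eval::quality::uniqueness::{UniqueRecord, UniquenessAnalyzer};
//!
//! let records = vec![UniqueRecord {
//!     primary_key: "1".to_string(),
//!     document_number: Some("DOC001".to_string()),
//!     content_hash: 42, // normally a hash of the serialized record
//!     key_fields: vec!["acme".to_string(), "2024-01-01".to_string()],
//! }];
//!
//! let analysis = UniquenessAnalyzer::default().analyze(&records).unwrap();
//! assert_eq!(analysis.exact_duplicates, 0);
//! ```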

use crate::error::EvalResult;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};

/// Results of uniqueness analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UniquenessAnalysis {
    /// Total records analyzed.
    pub total_records: usize,
    /// Exact duplicate count.
    pub exact_duplicates: usize,
    /// Near-duplicate count (similarity at or above the threshold, but below 1.0).
    pub near_duplicates: usize,
    /// Duplicate rate (0.0-1.0).
    pub duplicate_rate: f64,
    /// Primary key collisions.
    pub pk_collisions: usize,
    /// Document number collisions.
    pub doc_number_collisions: usize,
    /// Duplicate groups (records that are duplicates of each other).
    pub duplicate_groups: Vec<DuplicateInfo>,
    /// Uniqueness score (1.0 - duplicate_rate).
    pub uniqueness_score: f64,
}

/// Information about a group of duplicates.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicateInfo {
    /// Type of duplicate.
    pub duplicate_type: DuplicateType,
    /// Number of records in this duplicate group.
    pub count: usize,
    /// Example record identifiers.
    pub example_ids: Vec<String>,
    /// Similarity score, if applicable (1.0 for exact duplicates).
    pub similarity: Option<f64>,
}

/// Type of duplicate detected.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DuplicateType {
    /// Exact match on all fields.
    Exact,
    /// High similarity but not exact.
    NearDuplicate,
    /// Same primary key.
    PrimaryKeyCollision,
    /// Same document number.
    DocumentNumberCollision,
}

/// A record for uniqueness checking.
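///
/// A sketch of how a caller might populate `content_hash`, mirroring the
/// helper used in this module's tests (the serialization step is assumed
/// to happen elsewhere):
///
/// ```
/// use std::collections::hash_map::DefaultHasher;
/// use std::hash::{Hash, Hasher};
///
/// fn hash_content(serialized: &str) -> u64 {
///     let mut hasher = DefaultHasher::new();
///     serialized.hash(&mut hasher);
///     hasher.finish()
/// }
/// assert_eq!(hash_content("a"), hash_content("a"));
/// ```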
#[derive(Debug, Clone)]
pub struct UniqueRecord {
    /// Primary key value.
    pub primary_key: String,
    /// Document number (if applicable).
    pub document_number: Option<String>,
    /// Hash of record content for exact duplicate detection.
    pub content_hash: u64,
    /// Key fields for near-duplicate detection.
    pub key_fields: Vec<String>,
}

/// Analyzer for uniqueness.
pub struct UniquenessAnalyzer {
    /// Similarity threshold for near-duplicates.
    similarity_threshold: f64,
    /// Maximum duplicates to report in detail.
    max_report_duplicates: usize,
}

impl UniquenessAnalyzer {
    /// Create a new analyzer with the specified near-duplicate similarity threshold.
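    ///
    /// For example, `UniquenessAnalyzer::new(0.85)` flags record pairs whose
    /// key fields have a Jaccard similarity of at least 0.85 (but below 1.0)
    /// as near-duplicates.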
    pub fn new(similarity_threshold: f64) -> Self {
        Self {
            similarity_threshold,
            max_report_duplicates: 100,
        }
    }

    /// Analyze uniqueness of records.
    pub fn analyze(&self, records: &[UniqueRecord]) -> EvalResult<UniquenessAnalysis> {
        let total_records = records.len();
        let mut duplicate_groups = Vec::new();

        // Exact duplicate detection: group record indices by content hash.
        let mut hash_counts: HashMap<u64, Vec<usize>> = HashMap::new();
        for (idx, record) in records.iter().enumerate() {
            hash_counts
                .entry(record.content_hash)
                .or_default()
                .push(idx);
        }

        // Each group of k identical records contributes k - 1 duplicates.
        let mut exact_duplicates = 0;
        for indices in hash_counts.values() {
            if indices.len() > 1 {
                exact_duplicates += indices.len() - 1;
                if duplicate_groups.len() < self.max_report_duplicates {
                    duplicate_groups.push(DuplicateInfo {
                        duplicate_type: DuplicateType::Exact,
                        count: indices.len(),
                        example_ids: indices
                            .iter()
                            .take(3)
                            .map(|&i| records[i].primary_key.clone())
                            .collect(),
                        similarity: Some(1.0),
                    });
                }
            }
        }

        // Primary key collision detection
        let mut pk_seen: HashSet<&str> = HashSet::new();
        let mut pk_collisions = 0;
        for record in records {
            if !pk_seen.insert(&record.primary_key) {
                pk_collisions += 1;
            }
        }

        // Document number collision detection (only for records that have one)
        let mut doc_seen: HashSet<&str> = HashSet::new();
        let mut doc_number_collisions = 0;
        for record in records {
            if let Some(doc_num) = &record.document_number {
                if !doc_seen.insert(doc_num) {
                    doc_number_collisions += 1;
                }
            }
        }

        // Near-duplicate detection (simplified - check key field similarity)
        let near_duplicates = self.detect_near_duplicates(records, &mut duplicate_groups);

        // Extrapolated near-duplicate counts can exceed the record count on
        // heavily duplicated data, so clamp the rate to the documented 0.0-1.0.
        let duplicate_rate = if total_records > 0 {
            (((exact_duplicates + near_duplicates) as f64) / total_records as f64).min(1.0)
        } else {
            0.0
        };

        let uniqueness_score = 1.0 - duplicate_rate;

        Ok(UniquenessAnalysis {
            total_records,
            exact_duplicates,
            near_duplicates,
            duplicate_rate,
            pk_collisions,
            doc_number_collisions,
            duplicate_groups,
            uniqueness_score,
        })
    }

    /// Detect near-duplicates based on key field similarity.
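    ///
    /// Pairwise comparison is O(n^2), so large datasets are compared over a
    /// fixed-stride sample and the resulting pair count is extrapolated.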
    fn detect_near_duplicates(
        &self,
        records: &[UniqueRecord],
        duplicate_groups: &mut Vec<DuplicateInfo>,
    ) -> usize {
        let mut near_duplicates = 0;

        // For efficiency, only check a sample if the dataset is large.
        let sample_size = records.len().min(1000);
        let step = if records.len() > sample_size {
            // Ceiling division so the stride spans the whole dataset rather
            // than stopping after the first `sample_size` records.
            records.len().div_ceil(sample_size)
        } else {
            1
        };

        let sampled: Vec<_> = records.iter().step_by(step).take(sample_size).collect();

        for i in 0..sampled.len() {
            for j in (i + 1)..sampled.len() {
                let sim = self.calculate_similarity(&sampled[i].key_fields, &sampled[j].key_fields);
                if sim >= self.similarity_threshold && sim < 1.0 {
                    near_duplicates += 1;
                    if duplicate_groups.len() < self.max_report_duplicates {
                        duplicate_groups.push(DuplicateInfo {
                            duplicate_type: DuplicateType::NearDuplicate,
                            count: 2,
                            example_ids: vec![
                                sampled[i].primary_key.clone(),
                                sampled[j].primary_key.clone(),
                            ],
                            similarity: Some(sim),
                        });
                    }
                }
            }
        }

        // Extrapolate when sampling: the sample sees roughly 1 / step^2 of
        // all record pairs, so scale the count back up by step^2.
        if step > 1 {
            near_duplicates *= step * step;
        }

        near_duplicates
    }

    /// Calculate the Jaccard similarity between two sets of key fields.
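    ///
    /// For example, `["a", "b"]` and `["b", "c"]` share one of three distinct
    /// values, giving a similarity of `1.0 / 3.0`.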
    fn calculate_similarity(&self, fields1: &[String], fields2: &[String]) -> f64 {
        // Two empty field sets are trivially identical.
        if fields1.is_empty() && fields2.is_empty() {
            return 1.0;
        }

        let set1: HashSet<_> = fields1.iter().collect();
        let set2: HashSet<_> = fields2.iter().collect();

        let intersection = set1.intersection(&set2).count();
        // At least one set is non-empty here, so the union is never zero.
        let union = set1.union(&set2).count();

        intersection as f64 / union as f64
    }
}

impl Default for UniquenessAnalyzer {
    fn default() -> Self {
        Self::new(0.9) // 90% similarity threshold
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    fn hash_content(s: &str) -> u64 {
        let mut hasher = DefaultHasher::new();
        s.hash(&mut hasher);
        hasher.finish()
    }

    #[test]
    fn test_no_duplicates() {
        let records = vec![
            UniqueRecord {
                primary_key: "1".to_string(),
                document_number: Some("DOC001".to_string()),
                content_hash: hash_content("record1"),
                key_fields: vec!["a".to_string(), "b".to_string()],
            },
            UniqueRecord {
                primary_key: "2".to_string(),
                document_number: Some("DOC002".to_string()),
                content_hash: hash_content("record2"),
                key_fields: vec!["c".to_string(), "d".to_string()],
            },
        ];

        let analyzer = UniquenessAnalyzer::default();
        let result = analyzer.analyze(&records).unwrap();

        assert_eq!(result.exact_duplicates, 0);
        assert_eq!(result.pk_collisions, 0);
        assert_eq!(result.doc_number_collisions, 0);
    }

    #[test]
    fn test_exact_duplicates() {
        let hash = hash_content("same_content");
        let records = vec![
            UniqueRecord {
                primary_key: "1".to_string(),
                document_number: Some("DOC001".to_string()),
                content_hash: hash,
                key_fields: vec!["a".to_string()],
            },
            UniqueRecord {
                primary_key: "2".to_string(),
                document_number: Some("DOC002".to_string()),
                content_hash: hash, // Same hash = duplicate
                key_fields: vec!["a".to_string()],
            },
        ];

        let analyzer = UniquenessAnalyzer::default();
        let result = analyzer.analyze(&records).unwrap();

        assert_eq!(result.exact_duplicates, 1);
    }

    #[test]
    fn test_pk_collision() {
        let records = vec![
            UniqueRecord {
                primary_key: "SAME_PK".to_string(),
                document_number: None,
                content_hash: hash_content("record1"),
                key_fields: vec![],
            },
            UniqueRecord {
                primary_key: "SAME_PK".to_string(),
                document_number: None,
                content_hash: hash_content("record2"),
                key_fields: vec![],
            },
        ];

        let analyzer = UniquenessAnalyzer::default();
        let result = analyzer.analyze(&records).unwrap();

        assert_eq!(result.pk_collisions, 1);
    }
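
    // A sketch of an additional test exercising the near-duplicate path:
    // nine shared key fields out of ten distinct values gives a Jaccard
    // similarity of exactly 9/10 = 0.9, which meets the default threshold.
    #[test]
    fn test_near_duplicates() {
        let fields_a: Vec<String> = (1..=10).map(|i| format!("f{i}")).collect();
        let fields_b: Vec<String> = (1..=9).map(|i| format!("f{i}")).collect();

        let records = vec![
            UniqueRecord {
                primary_key: "1".to_string(),
                document_number: None,
                content_hash: hash_content("record1"),
                key_fields: fields_a,
            },
            UniqueRecord {
                primary_key: "2".to_string(),
                document_number: None,
                content_hash: hash_content("record2"),
                key_fields: fields_b,
            },
        ];

        let analyzer = UniquenessAnalyzer::default();
        let result = analyzer.analyze(&records).unwrap();

        assert_eq!(result.exact_duplicates, 0);
        assert_eq!(result.near_duplicates, 1);
    }
}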
317}