//! Uniqueness analysis: detects exact and near-duplicate records, primary key
//! and document number collisions, and derives an overall uniqueness score.

use crate::error::EvalResult;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};

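/// Summary of a uniqueness analysis over a set of records.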
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UniquenessAnalysis {
    /// Total number of records analyzed.
    pub total_records: usize,
    /// Records whose content hash matches an earlier record.
    pub exact_duplicates: usize,
    /// Estimated near-duplicate pairs at or above the similarity threshold.
    pub near_duplicates: usize,
    /// Share of records flagged as exact or near duplicates.
    pub duplicate_rate: f64,
    /// Records that reuse an already-seen primary key.
    pub pk_collisions: usize,
    /// Records that reuse an already-seen document number.
    pub doc_number_collisions: usize,
    /// Example duplicate groups (capped at a reporting limit).
    pub duplicate_groups: Vec<DuplicateInfo>,
    /// `1.0 - duplicate_rate`.
    pub uniqueness_score: f64,
}

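/// Details about one detected group of duplicate records.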
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicateInfo {
    pub duplicate_type: DuplicateType,
    pub count: usize,
    pub example_ids: Vec<String>,
    pub similarity: Option<f64>,
}

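/// Kinds of duplication the analyzer reports.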
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DuplicateType {
    Exact,
    NearDuplicate,
    PrimaryKeyCollision,
    DocumentNumberCollision,
}

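/// A record reduced to the fields needed for uniqueness checks.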
#[derive(Debug, Clone)]
pub struct UniqueRecord {
    pub primary_key: String,
    pub document_number: Option<String>,
    pub content_hash: u64,
    pub key_fields: Vec<String>,
}

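/// Detects duplicates in a record set and computes a uniqueness score.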
pub struct UniquenessAnalyzer {
    /// Jaccard similarity at or above which two records count as near duplicates.
    similarity_threshold: f64,
    /// Maximum number of duplicate groups to include in the report.
    max_report_duplicates: usize,
}

impl UniquenessAnalyzer {
    pub fn new(similarity_threshold: f64) -> Self {
        Self {
            similarity_threshold,
            max_report_duplicates: 100,
        }
    }

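    /// Analyzes `records` for exact duplicates, near duplicates, and key
    /// collisions, returning an aggregate [`UniquenessAnalysis`].
    ///
    /// Illustrative usage sketch (assumes a `records: Vec<UniqueRecord>` built
    /// elsewhere; marked `ignore` so it is not run as a doctest):
    ///
    /// ```ignore
    /// let analyzer = UniquenessAnalyzer::default();
    /// let analysis = analyzer.analyze(&records)?;
    /// println!("uniqueness score: {:.2}", analysis.uniqueness_score);
    /// ```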
    pub fn analyze(&self, records: &[UniqueRecord]) -> EvalResult<UniquenessAnalysis> {
        let total_records = records.len();
        let mut duplicate_groups = Vec::new();

        // Group record indices by content hash to find exact duplicates.
        let mut hash_counts: HashMap<u64, Vec<usize>> = HashMap::new();
        for (idx, record) in records.iter().enumerate() {
            hash_counts
                .entry(record.content_hash)
                .or_default()
                .push(idx);
        }

        // Every record beyond the first in a hash group is an exact duplicate.
        let mut exact_duplicates = 0;
        for indices in hash_counts.values() {
            if indices.len() > 1 {
                exact_duplicates += indices.len() - 1;
                if duplicate_groups.len() < self.max_report_duplicates {
                    duplicate_groups.push(DuplicateInfo {
                        duplicate_type: DuplicateType::Exact,
                        count: indices.len(),
                        example_ids: indices
                            .iter()
                            .take(3)
                            .map(|&i| records[i].primary_key.clone())
                            .collect(),
                        similarity: Some(1.0),
                    });
                }
            }
        }

        // A primary key that fails to insert has been seen before.
        let mut pk_seen: HashSet<&str> = HashSet::new();
        let mut pk_collisions = 0;
        for record in records {
            if !pk_seen.insert(&record.primary_key) {
                pk_collisions += 1;
            }
        }

        // Same check for document numbers, skipping records without one.
        let mut doc_seen: HashSet<&str> = HashSet::new();
        let mut doc_number_collisions = 0;
        for record in records {
            if let Some(ref doc_num) = record.document_number {
                if !doc_seen.insert(doc_num) {
                    doc_number_collisions += 1;
                }
            }
        }

        let near_duplicates = self.detect_near_duplicates(records, &mut duplicate_groups);

        // The near-duplicate estimate is extrapolated and can overshoot the
        // record count, so cap the rate at 1.0.
        let duplicate_rate = if total_records > 0 {
            ((exact_duplicates + near_duplicates) as f64 / total_records as f64).min(1.0)
        } else {
            0.0
        };

        let uniqueness_score = 1.0 - duplicate_rate;

        Ok(UniquenessAnalysis {
            total_records,
            exact_duplicates,
            near_duplicates,
            duplicate_rate,
            pk_collisions,
            doc_number_collisions,
            duplicate_groups,
            uniqueness_score,
        })
    }

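    /// Estimates the number of near-duplicate pairs by comparing a sample of
    /// records pairwise on their key fields.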
    fn detect_near_duplicates(
        &self,
        records: &[UniqueRecord],
        duplicate_groups: &mut Vec<DuplicateInfo>,
    ) -> usize {
        let mut near_duplicates = 0;

        // Compare at most 1000 records; for larger inputs take every `step`-th one.
        let sample_size = records.len().min(1000);
        let step = if records.len() > sample_size {
            records.len() / sample_size
        } else {
            1
        };

        let sampled: Vec<_> = records.iter().step_by(step).take(sample_size).collect();

        for i in 0..sampled.len() {
            for j in (i + 1)..sampled.len() {
                let sim = self.calculate_similarity(&sampled[i].key_fields, &sampled[j].key_fields);
                if sim >= self.similarity_threshold && sim < 1.0 {
                    near_duplicates += 1;
                    if duplicate_groups.len() < self.max_report_duplicates {
                        duplicate_groups.push(DuplicateInfo {
                            duplicate_type: DuplicateType::NearDuplicate,
                            count: 2,
                            example_ids: vec![
                                sampled[i].primary_key.clone(),
                                sampled[j].primary_key.clone(),
                            ],
                            similarity: Some(sim),
                        });
                    }
                }
            }
        }

        // Sampling every `step`-th record examines roughly 1/step^2 of all pairs,
        // so scale the sampled count back up to estimate the full total.
        if step > 1 {
            near_duplicates *= step * step;
        }

        near_duplicates
    }

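    /// Jaccard similarity between the two sets of key fields.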
    fn calculate_similarity(&self, fields1: &[String], fields2: &[String]) -> f64 {
        if fields1.is_empty() && fields2.is_empty() {
            return 1.0;
        }

        let set1: HashSet<_> = fields1.iter().collect();
        let set2: HashSet<_> = fields2.iter().collect();

        let intersection = set1.intersection(&set2).count();
        let union = set1.union(&set2).count();

        if union == 0 {
            1.0
        } else {
            intersection as f64 / union as f64
        }
    }
}

impl Default for UniquenessAnalyzer {
    fn default() -> Self {
        Self::new(0.9)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    fn hash_content(s: &str) -> u64 {
        let mut hasher = DefaultHasher::new();
        s.hash(&mut hasher);
        hasher.finish()
    }

    #[test]
    fn test_no_duplicates() {
        let records = vec![
            UniqueRecord {
                primary_key: "1".to_string(),
                document_number: Some("DOC001".to_string()),
                content_hash: hash_content("record1"),
                key_fields: vec!["a".to_string(), "b".to_string()],
            },
            UniqueRecord {
                primary_key: "2".to_string(),
                document_number: Some("DOC002".to_string()),
                content_hash: hash_content("record2"),
                key_fields: vec!["c".to_string(), "d".to_string()],
            },
        ];

        let analyzer = UniquenessAnalyzer::default();
        let result = analyzer.analyze(&records).unwrap();

        assert_eq!(result.exact_duplicates, 0);
        assert_eq!(result.pk_collisions, 0);
        assert_eq!(result.doc_number_collisions, 0);
    }

    #[test]
    fn test_exact_duplicates() {
        let hash = hash_content("same_content");
        let records = vec![
            UniqueRecord {
                primary_key: "1".to_string(),
                document_number: Some("DOC001".to_string()),
                content_hash: hash,
                key_fields: vec!["a".to_string()],
            },
            UniqueRecord {
                primary_key: "2".to_string(),
                document_number: Some("DOC002".to_string()),
                content_hash: hash,
                key_fields: vec!["a".to_string()],
            },
        ];

        let analyzer = UniquenessAnalyzer::default();
        let result = analyzer.analyze(&records).unwrap();

        assert_eq!(result.exact_duplicates, 1);
    }

    #[test]
    fn test_pk_collision() {
        let records = vec![
            UniqueRecord {
                primary_key: "SAME_PK".to_string(),
                document_number: None,
                content_hash: hash_content("record1"),
                key_fields: vec![],
            },
            UniqueRecord {
                primary_key: "SAME_PK".to_string(),
                document_number: None,
                content_hash: hash_content("record2"),
                key_fields: vec![],
            },
        ];

        let analyzer = UniquenessAnalyzer::default();
        let result = analyzer.analyze(&records).unwrap();

        assert_eq!(result.pk_collisions, 1);
    }
}