Skip to main content

datasynth_generators/data_quality/
duplicates.rs

1//! Duplicate record generation for data quality simulation.
2//!
3//! Simulates realistic duplicate scenarios:
4//! - Exact duplicates (complete record duplication)
5//! - Near duplicates (minor variations)
6//! - Fuzzy duplicates (similar but not identical)
7//! - Cross-system duplicates (different identifiers, same entity)
8
9use chrono::{Duration, NaiveDate};
10use rand::Rng;
11use rust_decimal::Decimal;
12use serde::{Deserialize, Serialize};
13
/// Type of duplicate.
///
/// Describes how a generated duplicate relates to the record it was copied
/// from and carries the parameters that were used to produce it.
#[derive(Debug, Clone, PartialEq)]
pub enum DuplicateType {
    /// Complete exact duplicate.
    Exact,
    /// Near duplicate with minor variations.
    Near {
        /// Names of the fields selected to vary between original and duplicate.
        varying_fields: Vec<String>,
    },
    /// Fuzzy duplicate with significant but recognizable differences.
    Fuzzy {
        /// Similarity threshold (0.0 - 1.0).
        ///
        /// The generator in this file uses `1.0 - similarity` as the
        /// probability of altering each candidate field, so higher values
        /// yield duplicates that stay closer to the original.
        similarity: f64,
    },
    /// Cross-system duplicate (same entity, different identifiers).
    CrossSystem {
        /// Source system identifier.
        source_system: String,
        /// Target system identifier.
        target_system: String,
    },
}
37
/// Configuration for duplicate generation.
///
/// All rates are fractions in the range 0.0 - 1.0 and are compared against
/// uniformly drawn `f64` values.
#[derive(Debug, Clone)]
pub struct DuplicateConfig {
    /// Overall duplicate rate: probability that a record gets duplicated.
    pub duplicate_rate: f64,
    /// Exact duplicate rate (of duplicates).
    pub exact_rate: f64,
    /// Near duplicate rate (of duplicates).
    ///
    /// Type selection treats `exact_rate` and `exact_rate + near_rate` as
    /// cumulative thresholds.
    pub near_rate: f64,
    /// Fuzzy duplicate rate (of duplicates).
    ///
    /// NOTE(review): type selection in this file never reads this value;
    /// fuzzy duplicates receive whatever probability remains after
    /// `exact_rate + near_rate`.
    pub fuzzy_rate: f64,
    /// Maximum days between duplicate entries (applied in both directions).
    pub max_date_offset_days: i64,
    /// Fields that commonly vary in near duplicates.
    pub varying_fields: Vec<String>,
    /// Amount variance for near duplicates, expressed as a fraction
    /// (`0.01` means up to 1% in either direction).
    pub amount_variance: f64,
}
56
57impl Default for DuplicateConfig {
58    fn default() -> Self {
59        Self {
60            duplicate_rate: 0.005, // 0.5% of records get duplicated
61            exact_rate: 0.3,       // 30% of duplicates are exact
62            near_rate: 0.5,        // 50% are near duplicates
63            fuzzy_rate: 0.2,       // 20% are fuzzy
64            max_date_offset_days: 5,
65            varying_fields: vec![
66                "entry_date".to_string(),
67                "created_by".to_string(),
68                "description".to_string(),
69            ],
70            amount_variance: 0.01, // 1% variance
71        }
72    }
73}
74
/// A duplicate record with metadata.
///
/// Pairs a generated duplicate with the record it was derived from, plus the
/// labels needed to trace how the duplicate was produced.
#[derive(Debug, Clone)]
pub struct DuplicateRecord<T: Clone> {
    /// The original record.
    pub original: T,
    /// The duplicate record.
    pub duplicate: T,
    /// Type of duplicate.
    pub duplicate_type: DuplicateType,
    /// Names of the fields reported as differing from the original.
    pub differing_fields: Vec<String>,
    /// Duplicate ID for tracking (distinct from the record's own identifier).
    pub duplicate_id: String,
}
89
/// Trait for records that can be duplicated.
///
/// Gives the generator and detector uniform access to a record's identifier,
/// named text fields, amount and date. Getters return `None` when the record
/// has no value for the requested piece of data.
pub trait Duplicatable: Clone {
    /// Returns the record's unique identifier.
    fn get_id(&self) -> String;

    /// Sets a new identifier.
    fn set_id(&mut self, id: String);

    /// Gets a field value by name, or `None` if the record has no such field.
    fn get_field(&self, field: &str) -> Option<String>;

    /// Sets a field value by name.
    fn set_field(&mut self, field: &str, value: &str);

    /// Gets the amount (for amount-bearing records).
    fn get_amount(&self) -> Option<Decimal>;

    /// Sets the amount.
    fn set_amount(&mut self, amount: Decimal);

    /// Gets the date.
    fn get_date(&self) -> Option<NaiveDate>;

    /// Sets the date.
    fn set_date(&mut self, date: NaiveDate);
}
116
/// Duplicate generator.
///
/// Holds the generation settings together with running statistics and the
/// counter used to build identifiers for duplicated records.
pub struct DuplicateGenerator {
    /// Rates and variation limits that drive generation.
    config: DuplicateConfig,
    /// Running counters, exposed through `stats()` and cleared by `reset_stats()`.
    stats: DuplicateStats,
    /// Next sequence number appended to a duplicated record's ID; starts at 1
    /// and is not touched by `reset_stats()`.
    next_duplicate_id: u64,
}
123
/// Statistics for duplicate generation.
///
/// All counters start at zero (`Default`) and can be serialized with serde.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DuplicateStats {
    /// Total records processed.
    ///
    /// NOTE(review): in this file the counter is only incremented by
    /// `DuplicateGenerator::create_duplicate`, so it advances in lockstep
    /// with `total_duplicates`.
    pub total_processed: usize,
    /// Total duplicates created.
    pub total_duplicates: usize,
    /// Exact duplicates.
    pub exact_duplicates: usize,
    /// Near duplicates.
    pub near_duplicates: usize,
    /// Fuzzy duplicates.
    pub fuzzy_duplicates: usize,
    /// Cross-system duplicates.
    pub cross_system_duplicates: usize,
}
140
141impl DuplicateGenerator {
142    /// Creates a new duplicate generator.
143    pub fn new(config: DuplicateConfig) -> Self {
144        Self {
145            config,
146            stats: DuplicateStats::default(),
147            next_duplicate_id: 1,
148        }
149    }
150
151    /// Determines if a record should be duplicated.
152    pub fn should_duplicate<R: Rng>(&self, rng: &mut R) -> bool {
153        rng.gen::<f64>() < self.config.duplicate_rate
154    }
155
156    /// Creates a duplicate of a record.
157    pub fn create_duplicate<T: Duplicatable, R: Rng>(
158        &mut self,
159        record: &T,
160        rng: &mut R,
161    ) -> DuplicateRecord<T> {
162        self.stats.total_processed += 1;
163        self.stats.total_duplicates += 1;
164
165        let duplicate_type = self.select_duplicate_type(rng);
166        let mut duplicate = record.clone();
167        let mut differing_fields = Vec::new();
168
169        // Generate new ID
170        let new_id = format!("{}-DUP{}", record.get_id(), self.next_duplicate_id);
171        self.next_duplicate_id += 1;
172        duplicate.set_id(new_id);
173        differing_fields.push("id".to_string());
174
175        match &duplicate_type {
176            DuplicateType::Exact => {
177                self.stats.exact_duplicates += 1;
178                // No other changes needed
179            }
180            DuplicateType::Near { varying_fields } => {
181                self.stats.near_duplicates += 1;
182                self.apply_near_duplicate_variations(&mut duplicate, varying_fields, rng);
183                differing_fields.extend(varying_fields.clone());
184            }
185            DuplicateType::Fuzzy { similarity } => {
186                self.stats.fuzzy_duplicates += 1;
187                let varied = self.apply_fuzzy_variations(&mut duplicate, *similarity, rng);
188                differing_fields.extend(varied);
189            }
190            DuplicateType::CrossSystem {
191                source_system: _,
192                target_system,
193            } => {
194                self.stats.cross_system_duplicates += 1;
195                // Change system identifier
196                if let Some(_current_id) = duplicate.get_field("system_id") {
197                    duplicate.set_field("system_id", target_system);
198                    differing_fields.push("system_id".to_string());
199                }
200            }
201        }
202
203        let duplicate_id = format!("DUP{:08}", self.stats.total_duplicates);
204
205        DuplicateRecord {
206            original: record.clone(),
207            duplicate,
208            duplicate_type,
209            differing_fields,
210            duplicate_id,
211        }
212    }
213
214    /// Selects the type of duplicate to create.
215    fn select_duplicate_type<R: Rng>(&self, rng: &mut R) -> DuplicateType {
216        let r = rng.gen::<f64>();
217
218        if r < self.config.exact_rate {
219            DuplicateType::Exact
220        } else if r < self.config.exact_rate + self.config.near_rate {
221            DuplicateType::Near {
222                varying_fields: self.config.varying_fields.clone(),
223            }
224        } else {
225            DuplicateType::Fuzzy {
226                similarity: rng.gen_range(0.8..0.95),
227            }
228        }
229    }
230
231    /// Applies near-duplicate variations.
232    fn apply_near_duplicate_variations<T: Duplicatable, R: Rng>(
233        &self,
234        record: &mut T,
235        varying_fields: &[String],
236        rng: &mut R,
237    ) {
238        for field in varying_fields {
239            match field.as_str() {
240                "entry_date" | "date" => {
241                    if let Some(date) = record.get_date() {
242                        let offset = rng.gen_range(
243                            -self.config.max_date_offset_days..=self.config.max_date_offset_days,
244                        );
245                        record.set_date(date + Duration::days(offset));
246                    }
247                }
248                "amount" | "debit_amount" | "credit_amount" => {
249                    if let Some(amount) = record.get_amount() {
250                        let variance = 1.0
251                            + rng.gen_range(
252                                -self.config.amount_variance..self.config.amount_variance,
253                            );
254                        let new_amount =
255                            amount * Decimal::from_f64_retain(variance).unwrap_or(Decimal::ONE);
256                        record.set_amount(new_amount.round_dp(2));
257                    }
258                }
259                "description" => {
260                    if let Some(desc) = record.get_field("description") {
261                        // Add minor variation
262                        let variations = [
263                            format!("{} ", desc),
264                            format!(" {}", desc),
265                            desc.to_uppercase(),
266                            desc.to_lowercase(),
267                        ];
268                        let variation = &variations[rng.gen_range(0..variations.len())];
269                        record.set_field("description", variation);
270                    }
271                }
272                _ => {
273                    // Generic variation: add whitespace
274                    if let Some(value) = record.get_field(field) {
275                        record.set_field(field, &format!("{} ", value));
276                    }
277                }
278            }
279        }
280    }
281
282    /// Applies fuzzy variations (more significant changes).
283    fn apply_fuzzy_variations<T: Duplicatable, R: Rng>(
284        &self,
285        record: &mut T,
286        similarity: f64,
287        rng: &mut R,
288    ) -> Vec<String> {
289        let mut varied_fields = Vec::new();
290        let change_probability = 1.0 - similarity;
291
292        // Amount variation
293        if rng.gen::<f64>() < change_probability {
294            if let Some(amount) = record.get_amount() {
295                let variance = 1.0 + rng.gen_range(-0.1..0.1); // Up to 10% variation
296                let new_amount =
297                    amount * Decimal::from_f64_retain(variance).unwrap_or(Decimal::ONE);
298                record.set_amount(new_amount.round_dp(2));
299                varied_fields.push("amount".to_string());
300            }
301        }
302
303        // Date variation
304        if rng.gen::<f64>() < change_probability {
305            if let Some(date) = record.get_date() {
306                let offset = rng.gen_range(-30..=30);
307                record.set_date(date + Duration::days(offset));
308                varied_fields.push("date".to_string());
309            }
310        }
311
312        // Description variation
313        if rng.gen::<f64>() < change_probability {
314            if let Some(desc) = record.get_field("description") {
315                // Introduce typos or abbreviations
316                let abbreviated = abbreviate_text(&desc);
317                record.set_field("description", &abbreviated);
318                varied_fields.push("description".to_string());
319            }
320        }
321
322        varied_fields
323    }
324
325    /// Returns statistics.
326    pub fn stats(&self) -> &DuplicateStats {
327        &self.stats
328    }
329
330    /// Resets statistics.
331    pub fn reset_stats(&mut self) {
332        self.stats = DuplicateStats::default();
333    }
334}
335
/// Shortens well-known business terms in `text` to their usual abbreviations.
///
/// Matching is case-sensitive and works on substrings, so `"Accounts"`
/// becomes `"Accts"`. Replacements are applied in table order.
fn abbreviate_text(text: &str) -> String {
    const ABBREVIATIONS: [(&str, &str); 10] = [
        ("Account", "Acct"),
        ("Payment", "Pmt"),
        ("Invoice", "Inv"),
        ("Number", "No"),
        ("Department", "Dept"),
        ("Company", "Co"),
        ("Corporation", "Corp"),
        ("International", "Intl"),
        ("Management", "Mgmt"),
        ("Reference", "Ref"),
    ];

    ABBREVIATIONS
        .iter()
        .fold(text.to_owned(), |shortened, &(full, abbr)| {
            shortened.replace(full, abbr)
        })
}
357
/// Detects potential duplicates in a dataset.
///
/// Compares records field by field and treats a pair as duplicates when
/// their average similarity reaches the configured threshold.
pub struct DuplicateDetector {
    /// Similarity threshold for fuzzy matching (minimum average similarity,
    /// compared with `>=`).
    similarity_threshold: f64,
    /// Fields to compare (names passed to `Duplicatable::get_field`).
    comparison_fields: Vec<String>,
}
365
366impl DuplicateDetector {
367    /// Creates a new duplicate detector.
368    pub fn new(similarity_threshold: f64, comparison_fields: Vec<String>) -> Self {
369        Self {
370            similarity_threshold,
371            comparison_fields,
372        }
373    }
374
375    /// Calculates similarity between two strings (Jaccard similarity).
376    pub fn string_similarity(&self, a: &str, b: &str) -> f64 {
377        if a == b {
378            return 1.0;
379        }
380
381        let a_chars: std::collections::HashSet<char> = a.chars().collect();
382        let b_chars: std::collections::HashSet<char> = b.chars().collect();
383
384        let intersection = a_chars.intersection(&b_chars).count();
385        let union = a_chars.union(&b_chars).count();
386
387        if union == 0 {
388            0.0
389        } else {
390            intersection as f64 / union as f64
391        }
392    }
393
394    /// Checks if two records are potential duplicates.
395    pub fn are_duplicates<T: Duplicatable>(&self, a: &T, b: &T) -> bool {
396        let mut total_similarity = 0.0;
397        let mut field_count = 0;
398
399        for field in &self.comparison_fields {
400            if let (Some(val_a), Some(val_b)) = (a.get_field(field), b.get_field(field)) {
401                total_similarity += self.string_similarity(&val_a, &val_b);
402                field_count += 1;
403            }
404        }
405
406        // Also compare amounts if available
407        if let (Some(amt_a), Some(amt_b)) = (a.get_amount(), b.get_amount()) {
408            let amt_a_f64: f64 = amt_a.try_into().unwrap_or(0.0);
409            let amt_b_f64: f64 = amt_b.try_into().unwrap_or(0.0);
410
411            if amt_a_f64.abs() > 0.0 {
412                let ratio = (amt_a_f64 - amt_b_f64).abs() / amt_a_f64.abs();
413                total_similarity += 1.0 - ratio.min(1.0);
414                field_count += 1;
415            }
416        }
417
418        if field_count == 0 {
419            return false;
420        }
421
422        let avg_similarity = total_similarity / field_count as f64;
423        avg_similarity >= self.similarity_threshold
424    }
425
426    /// Finds all duplicate pairs in a collection.
427    pub fn find_duplicates<T: Duplicatable>(&self, records: &[T]) -> Vec<(usize, usize, f64)> {
428        let mut duplicates = Vec::new();
429
430        for i in 0..records.len() {
431            for j in (i + 1)..records.len() {
432                if self.are_duplicates(&records[i], &records[j]) {
433                    let mut similarity = 0.0;
434                    let mut count = 0;
435
436                    for field in &self.comparison_fields {
437                        if let (Some(a), Some(b)) =
438                            (records[i].get_field(field), records[j].get_field(field))
439                        {
440                            similarity += self.string_similarity(&a, &b);
441                            count += 1;
442                        }
443                    }
444
445                    if count > 0 {
446                        duplicates.push((i, j, similarity / count as f64));
447                    }
448                }
449            }
450        }
451
452        duplicates
453    }
454}
455
// Unit tests for duplicate generation, string similarity and abbreviation.
// `unwrap` is allowed here because it is only applied to values the tests
// construct themselves (a fixed, valid calendar date).
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    // Simple test struct implementing Duplicatable
    #[derive(Clone)]
    struct TestRecord {
        id: String,
        description: String,
        amount: Decimal,
        date: NaiveDate,
    }

    impl Duplicatable for TestRecord {
        fn get_id(&self) -> String {
            self.id.clone()
        }

        fn set_id(&mut self, id: String) {
            self.id = id;
        }

        // Only "description" and "id" are readable by name; every other
        // field name yields `None`.
        fn get_field(&self, field: &str) -> Option<String> {
            match field {
                "description" => Some(self.description.clone()),
                "id" => Some(self.id.clone()),
                _ => None,
            }
        }

        // Only "description" is writable by name; other names are ignored.
        fn set_field(&mut self, field: &str, value: &str) {
            if field == "description" {
                self.description = value.to_string();
            }
        }

        fn get_amount(&self) -> Option<Decimal> {
            Some(self.amount)
        }

        fn set_amount(&mut self, amount: Decimal) {
            self.amount = amount;
        }

        fn get_date(&self) -> Option<NaiveDate> {
            Some(self.date)
        }

        fn set_date(&mut self, date: NaiveDate) {
            self.date = date;
        }
    }

    // Whatever duplicate type the seeded RNG selects, the duplicate must get
    // a new ID and be counted exactly once.
    #[test]
    fn test_duplicate_generation() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;
        use rust_decimal_macros::dec;

        let config = DuplicateConfig::default();
        let mut generator = DuplicateGenerator::new(config);
        let mut rng = ChaCha8Rng::seed_from_u64(42);

        let record = TestRecord {
            id: "JE001".to_string(),
            description: "Test Entry".to_string(),
            amount: dec!(1000),
            date: NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
        };

        let duplicate = generator.create_duplicate(&record, &mut rng);

        assert_ne!(duplicate.duplicate.get_id(), record.get_id());
        assert_eq!(generator.stats().total_duplicates, 1);
    }

    // Similarity is computed on sets of distinct characters: "hello" and
    // "helo" share the set {h, e, l, o}, while "abc" and "xyz" share nothing.
    #[test]
    fn test_string_similarity() {
        let detector = DuplicateDetector::new(0.8, vec!["description".to_string()]);

        assert_eq!(detector.string_similarity("hello", "hello"), 1.0);
        assert!(detector.string_similarity("hello", "helo") > 0.8);
        assert!(detector.string_similarity("abc", "xyz") < 0.5);
    }

    // Each known word is replaced by its abbreviation; separators are kept.
    #[test]
    fn test_abbreviate_text() {
        let text = "Account Payment Invoice";
        let abbreviated = abbreviate_text(text);
        assert_eq!(abbreviated, "Acct Pmt Inv");
    }
}