Skip to main content

datasynth_generators/data_quality/
duplicates.rs

1//! Duplicate record generation for data quality simulation.
2//!
3//! Simulates realistic duplicate scenarios:
4//! - Exact duplicates (complete record duplication)
5//! - Near duplicates (minor variations)
6//! - Fuzzy duplicates (similar but not identical)
7//! - Cross-system duplicates (different identifiers, same entity)
8
9use chrono::{Duration, NaiveDate};
10use rand::Rng;
11use rust_decimal::Decimal;
12
/// Classification of a generated duplicate.
///
/// Each variant describes how the duplicate relates to the record it was
/// copied from and carries the parameters that were used to produce it.
#[derive(Clone, Debug, PartialEq)]
pub enum DuplicateType {
    /// The record was copied without any field variation.
    Exact,
    /// The record was copied with small variations on selected fields.
    Near {
        /// Names of the fields selected for variation.
        varying_fields: Vec<String>,
    },
    /// The record was copied with larger, but still recognizable, changes.
    Fuzzy {
        /// Target similarity to the original, in the range 0.0 - 1.0.
        similarity: f64,
    },
    /// The same entity recorded under different identifiers in two systems.
    CrossSystem {
        /// Identifier of the system the original record comes from.
        source_system: String,
        /// Identifier of the system the duplicate is attributed to.
        target_system: String,
    },
}
36
/// Tunable parameters that control how duplicates are generated.
#[derive(Clone, Debug)]
pub struct DuplicateConfig {
    /// Probability that any given record is duplicated.
    pub duplicate_rate: f64,
    /// Share of duplicates that are exact copies.
    pub exact_rate: f64,
    /// Share of duplicates that are near duplicates.
    pub near_rate: f64,
    /// Share of duplicates that are fuzzy duplicates.
    pub fuzzy_rate: f64,
    /// Largest date shift, in days, applied to a near duplicate (either direction).
    pub max_date_offset_days: i64,
    /// Names of the fields that are varied when producing a near duplicate.
    pub varying_fields: Vec<String>,
    /// Relative amount variance for near duplicates, as a fraction (0.01 = 1%).
    pub amount_variance: f64,
}
55
56impl Default for DuplicateConfig {
57    fn default() -> Self {
58        Self {
59            duplicate_rate: 0.005, // 0.5% of records get duplicated
60            exact_rate: 0.3,       // 30% of duplicates are exact
61            near_rate: 0.5,        // 50% are near duplicates
62            fuzzy_rate: 0.2,       // 20% are fuzzy
63            max_date_offset_days: 5,
64            varying_fields: vec![
65                "entry_date".to_string(),
66                "created_by".to_string(),
67                "description".to_string(),
68            ],
69            amount_variance: 0.01, // 1% variance
70        }
71    }
72}
73
74/// A duplicate record with metadata.
75#[derive(Debug, Clone)]
76pub struct DuplicateRecord<T: Clone> {
77    /// The original record.
78    pub original: T,
79    /// The duplicate record.
80    pub duplicate: T,
81    /// Type of duplicate.
82    pub duplicate_type: DuplicateType,
83    /// Fields that differ.
84    pub differing_fields: Vec<String>,
85    /// Duplicate ID for tracking.
86    pub duplicate_id: String,
87}
88
89/// Trait for records that can be duplicated.
90pub trait Duplicatable: Clone {
91    /// Returns the record's unique identifier.
92    fn get_id(&self) -> String;
93
94    /// Sets a new identifier.
95    fn set_id(&mut self, id: String);
96
97    /// Gets a field value by name.
98    fn get_field(&self, field: &str) -> Option<String>;
99
100    /// Sets a field value by name.
101    fn set_field(&mut self, field: &str, value: &str);
102
103    /// Gets the amount (for amount-bearing records).
104    fn get_amount(&self) -> Option<Decimal>;
105
106    /// Sets the amount.
107    fn set_amount(&mut self, amount: Decimal);
108
109    /// Gets the date.
110    fn get_date(&self) -> Option<NaiveDate>;
111
112    /// Sets the date.
113    fn set_date(&mut self, date: NaiveDate);
114}
115
116/// Duplicate generator.
117pub struct DuplicateGenerator {
118    config: DuplicateConfig,
119    stats: DuplicateStats,
120    next_duplicate_id: u64,
121}
122
/// Running counters for duplicate generation. All counters start at zero.
#[derive(Clone, Debug, Default)]
pub struct DuplicateStats {
    /// Number of records processed.
    pub total_processed: usize,
    /// Number of duplicates created, across all types.
    pub total_duplicates: usize,
    /// Number of exact duplicates created.
    pub exact_duplicates: usize,
    /// Number of near duplicates created.
    pub near_duplicates: usize,
    /// Number of fuzzy duplicates created.
    pub fuzzy_duplicates: usize,
    /// Number of cross-system duplicates created.
    pub cross_system_duplicates: usize,
}
139
140impl DuplicateGenerator {
141    /// Creates a new duplicate generator.
142    pub fn new(config: DuplicateConfig) -> Self {
143        Self {
144            config,
145            stats: DuplicateStats::default(),
146            next_duplicate_id: 1,
147        }
148    }
149
150    /// Determines if a record should be duplicated.
151    pub fn should_duplicate<R: Rng>(&self, rng: &mut R) -> bool {
152        rng.gen::<f64>() < self.config.duplicate_rate
153    }
154
155    /// Creates a duplicate of a record.
156    pub fn create_duplicate<T: Duplicatable, R: Rng>(
157        &mut self,
158        record: &T,
159        rng: &mut R,
160    ) -> DuplicateRecord<T> {
161        self.stats.total_processed += 1;
162        self.stats.total_duplicates += 1;
163
164        let duplicate_type = self.select_duplicate_type(rng);
165        let mut duplicate = record.clone();
166        let mut differing_fields = Vec::new();
167
168        // Generate new ID
169        let new_id = format!("{}-DUP{}", record.get_id(), self.next_duplicate_id);
170        self.next_duplicate_id += 1;
171        duplicate.set_id(new_id);
172        differing_fields.push("id".to_string());
173
174        match &duplicate_type {
175            DuplicateType::Exact => {
176                self.stats.exact_duplicates += 1;
177                // No other changes needed
178            }
179            DuplicateType::Near { varying_fields } => {
180                self.stats.near_duplicates += 1;
181                self.apply_near_duplicate_variations(&mut duplicate, varying_fields, rng);
182                differing_fields.extend(varying_fields.clone());
183            }
184            DuplicateType::Fuzzy { similarity } => {
185                self.stats.fuzzy_duplicates += 1;
186                let varied = self.apply_fuzzy_variations(&mut duplicate, *similarity, rng);
187                differing_fields.extend(varied);
188            }
189            DuplicateType::CrossSystem {
190                source_system: _,
191                target_system,
192            } => {
193                self.stats.cross_system_duplicates += 1;
194                // Change system identifier
195                if let Some(_current_id) = duplicate.get_field("system_id") {
196                    duplicate.set_field("system_id", target_system);
197                    differing_fields.push("system_id".to_string());
198                }
199            }
200        }
201
202        let duplicate_id = format!("DUP{:08}", self.stats.total_duplicates);
203
204        DuplicateRecord {
205            original: record.clone(),
206            duplicate,
207            duplicate_type,
208            differing_fields,
209            duplicate_id,
210        }
211    }
212
213    /// Selects the type of duplicate to create.
214    fn select_duplicate_type<R: Rng>(&self, rng: &mut R) -> DuplicateType {
215        let r = rng.gen::<f64>();
216
217        if r < self.config.exact_rate {
218            DuplicateType::Exact
219        } else if r < self.config.exact_rate + self.config.near_rate {
220            DuplicateType::Near {
221                varying_fields: self.config.varying_fields.clone(),
222            }
223        } else {
224            DuplicateType::Fuzzy {
225                similarity: rng.gen_range(0.8..0.95),
226            }
227        }
228    }
229
230    /// Applies near-duplicate variations.
231    fn apply_near_duplicate_variations<T: Duplicatable, R: Rng>(
232        &self,
233        record: &mut T,
234        varying_fields: &[String],
235        rng: &mut R,
236    ) {
237        for field in varying_fields {
238            match field.as_str() {
239                "entry_date" | "date" => {
240                    if let Some(date) = record.get_date() {
241                        let offset = rng.gen_range(
242                            -self.config.max_date_offset_days..=self.config.max_date_offset_days,
243                        );
244                        record.set_date(date + Duration::days(offset));
245                    }
246                }
247                "amount" | "debit_amount" | "credit_amount" => {
248                    if let Some(amount) = record.get_amount() {
249                        let variance = 1.0
250                            + rng.gen_range(
251                                -self.config.amount_variance..self.config.amount_variance,
252                            );
253                        let new_amount =
254                            amount * Decimal::from_f64_retain(variance).unwrap_or(Decimal::ONE);
255                        record.set_amount(new_amount.round_dp(2));
256                    }
257                }
258                "description" => {
259                    if let Some(desc) = record.get_field("description") {
260                        // Add minor variation
261                        let variations = [
262                            format!("{} ", desc),
263                            format!(" {}", desc),
264                            desc.to_uppercase(),
265                            desc.to_lowercase(),
266                        ];
267                        let variation = &variations[rng.gen_range(0..variations.len())];
268                        record.set_field("description", variation);
269                    }
270                }
271                _ => {
272                    // Generic variation: add whitespace
273                    if let Some(value) = record.get_field(field) {
274                        record.set_field(field, &format!("{} ", value));
275                    }
276                }
277            }
278        }
279    }
280
281    /// Applies fuzzy variations (more significant changes).
282    fn apply_fuzzy_variations<T: Duplicatable, R: Rng>(
283        &self,
284        record: &mut T,
285        similarity: f64,
286        rng: &mut R,
287    ) -> Vec<String> {
288        let mut varied_fields = Vec::new();
289        let change_probability = 1.0 - similarity;
290
291        // Amount variation
292        if rng.gen::<f64>() < change_probability {
293            if let Some(amount) = record.get_amount() {
294                let variance = 1.0 + rng.gen_range(-0.1..0.1); // Up to 10% variation
295                let new_amount =
296                    amount * Decimal::from_f64_retain(variance).unwrap_or(Decimal::ONE);
297                record.set_amount(new_amount.round_dp(2));
298                varied_fields.push("amount".to_string());
299            }
300        }
301
302        // Date variation
303        if rng.gen::<f64>() < change_probability {
304            if let Some(date) = record.get_date() {
305                let offset = rng.gen_range(-30..=30);
306                record.set_date(date + Duration::days(offset));
307                varied_fields.push("date".to_string());
308            }
309        }
310
311        // Description variation
312        if rng.gen::<f64>() < change_probability {
313            if let Some(desc) = record.get_field("description") {
314                // Introduce typos or abbreviations
315                let abbreviated = abbreviate_text(&desc);
316                record.set_field("description", &abbreviated);
317                varied_fields.push("description".to_string());
318            }
319        }
320
321        varied_fields
322    }
323
324    /// Returns statistics.
325    pub fn stats(&self) -> &DuplicateStats {
326        &self.stats
327    }
328
329    /// Resets statistics.
330    pub fn reset_stats(&mut self) {
331        self.stats = DuplicateStats::default();
332    }
333}
334
/// Shortens `text` by swapping common business words for their abbreviations.
///
/// Every occurrence of each full word is replaced, in table order. Matching is
/// case-sensitive and works on substrings, so `"Accounts"` becomes `"Accts"`.
fn abbreviate_text(text: &str) -> String {
    const ABBREVIATIONS: [(&str, &str); 10] = [
        ("Account", "Acct"),
        ("Payment", "Pmt"),
        ("Invoice", "Inv"),
        ("Number", "No"),
        ("Department", "Dept"),
        ("Company", "Co"),
        ("Corporation", "Corp"),
        ("International", "Intl"),
        ("Management", "Mgmt"),
        ("Reference", "Ref"),
    ];

    ABBREVIATIONS
        .iter()
        .fold(text.to_string(), |shortened, &(full, abbr)| {
            shortened.replace(full, abbr)
        })
}
356
/// Finds records in a dataset that are likely duplicates of one another.
pub struct DuplicateDetector {
    /// Minimum average similarity at which two records count as duplicates.
    similarity_threshold: f64,
    /// Names of the fields whose values are compared.
    comparison_fields: Vec<String>,
}
364
365impl DuplicateDetector {
366    /// Creates a new duplicate detector.
367    pub fn new(similarity_threshold: f64, comparison_fields: Vec<String>) -> Self {
368        Self {
369            similarity_threshold,
370            comparison_fields,
371        }
372    }
373
374    /// Calculates similarity between two strings (Jaccard similarity).
375    pub fn string_similarity(&self, a: &str, b: &str) -> f64 {
376        if a == b {
377            return 1.0;
378        }
379
380        let a_chars: std::collections::HashSet<char> = a.chars().collect();
381        let b_chars: std::collections::HashSet<char> = b.chars().collect();
382
383        let intersection = a_chars.intersection(&b_chars).count();
384        let union = a_chars.union(&b_chars).count();
385
386        if union == 0 {
387            0.0
388        } else {
389            intersection as f64 / union as f64
390        }
391    }
392
393    /// Checks if two records are potential duplicates.
394    pub fn are_duplicates<T: Duplicatable>(&self, a: &T, b: &T) -> bool {
395        let mut total_similarity = 0.0;
396        let mut field_count = 0;
397
398        for field in &self.comparison_fields {
399            if let (Some(val_a), Some(val_b)) = (a.get_field(field), b.get_field(field)) {
400                total_similarity += self.string_similarity(&val_a, &val_b);
401                field_count += 1;
402            }
403        }
404
405        // Also compare amounts if available
406        if let (Some(amt_a), Some(amt_b)) = (a.get_amount(), b.get_amount()) {
407            let amt_a_f64: f64 = amt_a.try_into().unwrap_or(0.0);
408            let amt_b_f64: f64 = amt_b.try_into().unwrap_or(0.0);
409
410            if amt_a_f64.abs() > 0.0 {
411                let ratio = (amt_a_f64 - amt_b_f64).abs() / amt_a_f64.abs();
412                total_similarity += 1.0 - ratio.min(1.0);
413                field_count += 1;
414            }
415        }
416
417        if field_count == 0 {
418            return false;
419        }
420
421        let avg_similarity = total_similarity / field_count as f64;
422        avg_similarity >= self.similarity_threshold
423    }
424
425    /// Finds all duplicate pairs in a collection.
426    pub fn find_duplicates<T: Duplicatable>(&self, records: &[T]) -> Vec<(usize, usize, f64)> {
427        let mut duplicates = Vec::new();
428
429        for i in 0..records.len() {
430            for j in (i + 1)..records.len() {
431                if self.are_duplicates(&records[i], &records[j]) {
432                    let mut similarity = 0.0;
433                    let mut count = 0;
434
435                    for field in &self.comparison_fields {
436                        if let (Some(a), Some(b)) =
437                            (records[i].get_field(field), records[j].get_field(field))
438                        {
439                            similarity += self.string_similarity(&a, &b);
440                            count += 1;
441                        }
442                    }
443
444                    if count > 0 {
445                        duplicates.push((i, j, similarity / count as f64));
446                    }
447                }
448            }
449        }
450
451        duplicates
452    }
453}
454
#[cfg(test)]
mod tests {
    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;
    use rust_decimal_macros::dec;

    /// Minimal record used to exercise the generator and detector.
    ///
    /// Only `description` and `id` are readable by name, and only
    /// `description` is writable by name.
    #[derive(Clone)]
    struct TestRecord {
        id: String,
        description: String,
        amount: Decimal,
        date: NaiveDate,
    }

    impl Duplicatable for TestRecord {
        fn get_id(&self) -> String {
            self.id.clone()
        }

        fn set_id(&mut self, id: String) {
            self.id = id;
        }

        fn get_field(&self, field: &str) -> Option<String> {
            let value = match field {
                "description" => &self.description,
                "id" => &self.id,
                _ => return None,
            };
            Some(value.clone())
        }

        fn set_field(&mut self, field: &str, value: &str) {
            match field {
                "description" => self.description = value.to_string(),
                _ => {}
            }
        }

        fn get_amount(&self) -> Option<Decimal> {
            Some(self.amount)
        }

        fn set_amount(&mut self, amount: Decimal) {
            self.amount = amount;
        }

        fn get_date(&self) -> Option<NaiveDate> {
            Some(self.date)
        }

        fn set_date(&mut self, date: NaiveDate) {
            self.date = date;
        }
    }

    /// Builds the journal-entry fixture shared by the tests.
    fn sample_record() -> TestRecord {
        TestRecord {
            id: "JE001".to_string(),
            description: "Test Entry".to_string(),
            amount: dec!(1000),
            date: NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
        }
    }

    #[test]
    fn test_duplicate_generation() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let mut generator = DuplicateGenerator::new(DuplicateConfig::default());
        let record = sample_record();

        let duplicate = generator.create_duplicate(&record, &mut rng);

        // The duplicate must carry its own identifier and be counted once.
        assert_ne!(duplicate.duplicate.get_id(), record.get_id());
        assert_eq!(generator.stats().total_duplicates, 1);
    }

    #[test]
    fn test_string_similarity() {
        let detector = DuplicateDetector::new(0.8, vec!["description".to_string()]);

        let identical = detector.string_similarity("hello", "hello");
        let close = detector.string_similarity("hello", "helo");
        let unrelated = detector.string_similarity("abc", "xyz");

        assert_eq!(identical, 1.0);
        assert!(close > 0.8);
        assert!(unrelated < 0.5);
    }

    #[test]
    fn test_abbreviate_text() {
        assert_eq!(abbreviate_text("Account Payment Invoice"), "Acct Pmt Inv");
    }
}