Skip to main content

datasynth_generators/data_quality/
duplicates.rs

//! Duplicate record generation for data quality simulation.
//!
//! Simulates realistic duplicate scenarios:
//! - Exact duplicates (complete record duplication)
//! - Near duplicates (minor variations)
//! - Fuzzy duplicates (similar but not identical)
//! - Cross-system duplicates (different identifiers, same entity)
8
9use chrono::{Duration, NaiveDate};
10use rand::Rng;
11use rust_decimal::Decimal;
12
/// Classifies how a generated duplicate relates to its original record.
#[derive(Clone, Debug, PartialEq)]
pub enum DuplicateType {
    /// The duplicate is a complete copy of the original.
    Exact,
    /// The duplicate matches the original apart from small variations.
    Near {
        /// Names of the fields selected to vary.
        varying_fields: Vec<String>,
    },
    /// The duplicate differs noticeably but is still recognizable.
    Fuzzy {
        /// Target similarity to the original, in the range 0.0 - 1.0.
        similarity: f64,
    },
    /// The same entity recorded under different identifiers in two systems.
    CrossSystem {
        /// Identifier of the system the original comes from.
        source_system: String,
        /// Identifier of the system the duplicate is attributed to.
        target_system: String,
    },
}
36
/// Tunable parameters that control how duplicates are generated.
#[derive(Clone, Debug)]
pub struct DuplicateConfig {
    /// Probability (0.0 - 1.0) that any given record gets duplicated.
    pub duplicate_rate: f64,
    /// Share of duplicates that are exact copies.
    pub exact_rate: f64,
    /// Share of duplicates that are near duplicates.
    pub near_rate: f64,
    /// Share of duplicates that are fuzzy.
    ///
    /// Note: the type selection in this file only reads `exact_rate` and
    /// `near_rate`; fuzzy duplicates receive whatever probability remains.
    pub fuzzy_rate: f64,
    /// Largest date shift, in days, applied in either direction to a near
    /// duplicate.
    pub max_date_offset_days: i64,
    /// Field names that near duplicates are asked to vary.
    pub varying_fields: Vec<String>,
    /// Relative amount variance for near duplicates, expressed as a fraction
    /// (`0.01` means up to 1%).
    pub amount_variance: f64,
}
55
56impl Default for DuplicateConfig {
57    fn default() -> Self {
58        Self {
59            duplicate_rate: 0.005, // 0.5% of records get duplicated
60            exact_rate: 0.3,       // 30% of duplicates are exact
61            near_rate: 0.5,        // 50% are near duplicates
62            fuzzy_rate: 0.2,       // 20% are fuzzy
63            max_date_offset_days: 5,
64            varying_fields: vec![
65                "entry_date".to_string(),
66                "created_by".to_string(),
67                "description".to_string(),
68            ],
69            amount_variance: 0.01, // 1% variance
70        }
71    }
72}
73
74/// A duplicate record with metadata.
75#[derive(Debug, Clone)]
76pub struct DuplicateRecord<T: Clone> {
77    /// The original record.
78    pub original: T,
79    /// The duplicate record.
80    pub duplicate: T,
81    /// Type of duplicate.
82    pub duplicate_type: DuplicateType,
83    /// Fields that differ.
84    pub differing_fields: Vec<String>,
85    /// Duplicate ID for tracking.
86    pub duplicate_id: String,
87}
88
89/// Trait for records that can be duplicated.
90pub trait Duplicatable: Clone {
91    /// Returns the record's unique identifier.
92    fn get_id(&self) -> String;
93
94    /// Sets a new identifier.
95    fn set_id(&mut self, id: String);
96
97    /// Gets a field value by name.
98    fn get_field(&self, field: &str) -> Option<String>;
99
100    /// Sets a field value by name.
101    fn set_field(&mut self, field: &str, value: &str);
102
103    /// Gets the amount (for amount-bearing records).
104    fn get_amount(&self) -> Option<Decimal>;
105
106    /// Sets the amount.
107    fn set_amount(&mut self, amount: Decimal);
108
109    /// Gets the date.
110    fn get_date(&self) -> Option<NaiveDate>;
111
112    /// Sets the date.
113    fn set_date(&mut self, date: NaiveDate);
114}
115
116/// Duplicate generator.
117pub struct DuplicateGenerator {
118    config: DuplicateConfig,
119    stats: DuplicateStats,
120    next_duplicate_id: u64,
121}
122
/// Running counters maintained by the duplicate generator.
#[derive(Clone, Debug, Default)]
pub struct DuplicateStats {
    /// Number of records processed.
    pub total_processed: usize,
    /// Number of duplicates created, across all types.
    pub total_duplicates: usize,
    /// Number of exact duplicates created.
    pub exact_duplicates: usize,
    /// Number of near duplicates created.
    pub near_duplicates: usize,
    /// Number of fuzzy duplicates created.
    pub fuzzy_duplicates: usize,
    /// Number of cross-system duplicates created.
    pub cross_system_duplicates: usize,
}
139
140impl DuplicateGenerator {
141    /// Creates a new duplicate generator.
142    pub fn new(config: DuplicateConfig) -> Self {
143        Self {
144            config,
145            stats: DuplicateStats::default(),
146            next_duplicate_id: 1,
147        }
148    }
149
150    /// Determines if a record should be duplicated.
151    pub fn should_duplicate<R: Rng>(&self, rng: &mut R) -> bool {
152        rng.gen::<f64>() < self.config.duplicate_rate
153    }
154
155    /// Creates a duplicate of a record.
156    pub fn create_duplicate<T: Duplicatable, R: Rng>(
157        &mut self,
158        record: &T,
159        rng: &mut R,
160    ) -> DuplicateRecord<T> {
161        self.stats.total_processed += 1;
162        self.stats.total_duplicates += 1;
163
164        let duplicate_type = self.select_duplicate_type(rng);
165        let mut duplicate = record.clone();
166        let mut differing_fields = Vec::new();
167
168        // Generate new ID
169        let new_id = format!("{}-DUP{}", record.get_id(), self.next_duplicate_id);
170        self.next_duplicate_id += 1;
171        duplicate.set_id(new_id);
172        differing_fields.push("id".to_string());
173
174        match &duplicate_type {
175            DuplicateType::Exact => {
176                self.stats.exact_duplicates += 1;
177                // No other changes needed
178            }
179            DuplicateType::Near { varying_fields } => {
180                self.stats.near_duplicates += 1;
181                self.apply_near_duplicate_variations(&mut duplicate, varying_fields, rng);
182                differing_fields.extend(varying_fields.clone());
183            }
184            DuplicateType::Fuzzy { similarity } => {
185                self.stats.fuzzy_duplicates += 1;
186                let varied = self.apply_fuzzy_variations(&mut duplicate, *similarity, rng);
187                differing_fields.extend(varied);
188            }
189            DuplicateType::CrossSystem {
190                source_system: _,
191                target_system,
192            } => {
193                self.stats.cross_system_duplicates += 1;
194                // Change system identifier
195                if let Some(_current_id) = duplicate.get_field("system_id") {
196                    duplicate.set_field("system_id", target_system);
197                    differing_fields.push("system_id".to_string());
198                }
199            }
200        }
201
202        let duplicate_id = format!("DUP{:08}", self.stats.total_duplicates);
203
204        DuplicateRecord {
205            original: record.clone(),
206            duplicate,
207            duplicate_type,
208            differing_fields,
209            duplicate_id,
210        }
211    }
212
213    /// Selects the type of duplicate to create.
214    fn select_duplicate_type<R: Rng>(&self, rng: &mut R) -> DuplicateType {
215        let r = rng.gen::<f64>();
216
217        if r < self.config.exact_rate {
218            DuplicateType::Exact
219        } else if r < self.config.exact_rate + self.config.near_rate {
220            DuplicateType::Near {
221                varying_fields: self.config.varying_fields.clone(),
222            }
223        } else {
224            DuplicateType::Fuzzy {
225                similarity: rng.gen_range(0.8..0.95),
226            }
227        }
228    }
229
230    /// Applies near-duplicate variations.
231    fn apply_near_duplicate_variations<T: Duplicatable, R: Rng>(
232        &self,
233        record: &mut T,
234        varying_fields: &[String],
235        rng: &mut R,
236    ) {
237        for field in varying_fields {
238            match field.as_str() {
239                "entry_date" | "date" => {
240                    if let Some(date) = record.get_date() {
241                        let offset = rng.gen_range(
242                            -self.config.max_date_offset_days..=self.config.max_date_offset_days,
243                        );
244                        record.set_date(date + Duration::days(offset));
245                    }
246                }
247                "amount" | "debit_amount" | "credit_amount" => {
248                    if let Some(amount) = record.get_amount() {
249                        let variance = 1.0
250                            + rng.gen_range(
251                                -self.config.amount_variance..self.config.amount_variance,
252                            );
253                        let new_amount =
254                            amount * Decimal::from_f64_retain(variance).unwrap_or(Decimal::ONE);
255                        record.set_amount(new_amount.round_dp(2));
256                    }
257                }
258                "description" => {
259                    if let Some(desc) = record.get_field("description") {
260                        // Add minor variation
261                        let variations = [
262                            format!("{} ", desc),
263                            format!(" {}", desc),
264                            desc.to_uppercase(),
265                            desc.to_lowercase(),
266                        ];
267                        let variation = &variations[rng.gen_range(0..variations.len())];
268                        record.set_field("description", variation);
269                    }
270                }
271                _ => {
272                    // Generic variation: add whitespace
273                    if let Some(value) = record.get_field(field) {
274                        record.set_field(field, &format!("{} ", value));
275                    }
276                }
277            }
278        }
279    }
280
281    /// Applies fuzzy variations (more significant changes).
282    fn apply_fuzzy_variations<T: Duplicatable, R: Rng>(
283        &self,
284        record: &mut T,
285        similarity: f64,
286        rng: &mut R,
287    ) -> Vec<String> {
288        let mut varied_fields = Vec::new();
289        let change_probability = 1.0 - similarity;
290
291        // Amount variation
292        if rng.gen::<f64>() < change_probability {
293            if let Some(amount) = record.get_amount() {
294                let variance = 1.0 + rng.gen_range(-0.1..0.1); // Up to 10% variation
295                let new_amount =
296                    amount * Decimal::from_f64_retain(variance).unwrap_or(Decimal::ONE);
297                record.set_amount(new_amount.round_dp(2));
298                varied_fields.push("amount".to_string());
299            }
300        }
301
302        // Date variation
303        if rng.gen::<f64>() < change_probability {
304            if let Some(date) = record.get_date() {
305                let offset = rng.gen_range(-30..=30);
306                record.set_date(date + Duration::days(offset));
307                varied_fields.push("date".to_string());
308            }
309        }
310
311        // Description variation
312        if rng.gen::<f64>() < change_probability {
313            if let Some(desc) = record.get_field("description") {
314                // Introduce typos or abbreviations
315                let abbreviated = abbreviate_text(&desc);
316                record.set_field("description", &abbreviated);
317                varied_fields.push("description".to_string());
318            }
319        }
320
321        varied_fields
322    }
323
324    /// Returns statistics.
325    pub fn stats(&self) -> &DuplicateStats {
326        &self.stats
327    }
328
329    /// Resets statistics.
330    pub fn reset_stats(&mut self) {
331        self.stats = DuplicateStats::default();
332    }
333}
334
/// Shortens `text` by swapping common business words for their abbreviations.
///
/// Matching is case-sensitive and purely textual (substrings inside longer
/// words are replaced too). Replacements are applied one after another in
/// table order.
fn abbreviate_text(text: &str) -> String {
    const ABBREVIATIONS: [(&str, &str); 10] = [
        ("Account", "Acct"),
        ("Payment", "Pmt"),
        ("Invoice", "Inv"),
        ("Number", "No"),
        ("Department", "Dept"),
        ("Company", "Co"),
        ("Corporation", "Corp"),
        ("International", "Intl"),
        ("Management", "Mgmt"),
        ("Reference", "Ref"),
    ];

    ABBREVIATIONS
        .iter()
        .fold(text.to_string(), |shortened, &(full, abbr)| {
            shortened.replace(full, abbr)
        })
}
356
/// Finds likely duplicates in a dataset by comparing selected fields.
pub struct DuplicateDetector {
    /// Minimum average similarity at which two records count as duplicates.
    similarity_threshold: f64,
    /// Names of the fields whose values are compared.
    comparison_fields: Vec<String>,
}
364
365impl DuplicateDetector {
366    /// Creates a new duplicate detector.
367    pub fn new(similarity_threshold: f64, comparison_fields: Vec<String>) -> Self {
368        Self {
369            similarity_threshold,
370            comparison_fields,
371        }
372    }
373
374    /// Calculates similarity between two strings (Jaccard similarity).
375    pub fn string_similarity(&self, a: &str, b: &str) -> f64 {
376        if a == b {
377            return 1.0;
378        }
379
380        let a_chars: std::collections::HashSet<char> = a.chars().collect();
381        let b_chars: std::collections::HashSet<char> = b.chars().collect();
382
383        let intersection = a_chars.intersection(&b_chars).count();
384        let union = a_chars.union(&b_chars).count();
385
386        if union == 0 {
387            0.0
388        } else {
389            intersection as f64 / union as f64
390        }
391    }
392
393    /// Checks if two records are potential duplicates.
394    pub fn are_duplicates<T: Duplicatable>(&self, a: &T, b: &T) -> bool {
395        let mut total_similarity = 0.0;
396        let mut field_count = 0;
397
398        for field in &self.comparison_fields {
399            if let (Some(val_a), Some(val_b)) = (a.get_field(field), b.get_field(field)) {
400                total_similarity += self.string_similarity(&val_a, &val_b);
401                field_count += 1;
402            }
403        }
404
405        // Also compare amounts if available
406        if let (Some(amt_a), Some(amt_b)) = (a.get_amount(), b.get_amount()) {
407            let amt_a_f64: f64 = amt_a.try_into().unwrap_or(0.0);
408            let amt_b_f64: f64 = amt_b.try_into().unwrap_or(0.0);
409
410            if amt_a_f64.abs() > 0.0 {
411                let ratio = (amt_a_f64 - amt_b_f64).abs() / amt_a_f64.abs();
412                total_similarity += 1.0 - ratio.min(1.0);
413                field_count += 1;
414            }
415        }
416
417        if field_count == 0 {
418            return false;
419        }
420
421        let avg_similarity = total_similarity / field_count as f64;
422        avg_similarity >= self.similarity_threshold
423    }
424
425    /// Finds all duplicate pairs in a collection.
426    pub fn find_duplicates<T: Duplicatable>(&self, records: &[T]) -> Vec<(usize, usize, f64)> {
427        let mut duplicates = Vec::new();
428
429        for i in 0..records.len() {
430            for j in (i + 1)..records.len() {
431                if self.are_duplicates(&records[i], &records[j]) {
432                    let mut similarity = 0.0;
433                    let mut count = 0;
434
435                    for field in &self.comparison_fields {
436                        if let (Some(a), Some(b)) =
437                            (records[i].get_field(field), records[j].get_field(field))
438                        {
439                            similarity += self.string_similarity(&a, &b);
440                            count += 1;
441                        }
442                    }
443
444                    if count > 0 {
445                        duplicates.push((i, j, similarity / count as f64));
446                    }
447                }
448            }
449        }
450
451        duplicates
452    }
453}
454
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Minimal record for exercising `Duplicatable`: `id` and `description`
    /// can be read by name, but only `description` can be written by name.
    #[derive(Clone)]
    struct TestRecord {
        id: String,
        description: String,
        amount: Decimal,
        date: NaiveDate,
    }

    impl Duplicatable for TestRecord {
        fn get_id(&self) -> String {
            self.id.clone()
        }

        fn set_id(&mut self, id: String) {
            self.id = id;
        }

        fn get_field(&self, field: &str) -> Option<String> {
            let value = match field {
                "description" => &self.description,
                "id" => &self.id,
                _ => return None,
            };
            Some(value.clone())
        }

        fn set_field(&mut self, field: &str, value: &str) {
            // Writes to any other field name are ignored.
            if field != "description" {
                return;
            }
            self.description = value.to_string();
        }

        fn get_amount(&self) -> Option<Decimal> {
            Some(self.amount)
        }

        fn set_amount(&mut self, amount: Decimal) {
            self.amount = amount;
        }

        fn get_date(&self) -> Option<NaiveDate> {
            Some(self.date)
        }

        fn set_date(&mut self, date: NaiveDate) {
            self.date = date;
        }
    }

    /// A generated duplicate gets a fresh ID and is counted in the stats.
    #[test]
    fn test_duplicate_generation() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;
        use rust_decimal_macros::dec;

        let original = TestRecord {
            id: "JE001".to_string(),
            description: "Test Entry".to_string(),
            amount: dec!(1000),
            date: NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
        };

        let mut generator = DuplicateGenerator::new(DuplicateConfig::default());
        let mut rng = ChaCha8Rng::seed_from_u64(42);

        let generated = generator.create_duplicate(&original, &mut rng);

        assert_ne!(generated.duplicate.get_id(), original.get_id());
        assert_eq!(generator.stats().total_duplicates, 1);
    }

    /// Identical strings score 1.0, similar ones high, disjoint ones low.
    #[test]
    fn test_string_similarity() {
        let fields = vec!["description".to_string()];
        let detector = DuplicateDetector::new(0.8, fields);

        assert_eq!(detector.string_similarity("hello", "hello"), 1.0);
        assert!(detector.string_similarity("hello", "helo") > 0.8);
        assert!(detector.string_similarity("abc", "xyz") < 0.5);
    }

    /// Known words are replaced by their abbreviations.
    #[test]
    fn test_abbreviate_text() {
        let shortened = abbreviate_text("Account Payment Invoice");
        assert_eq!(shortened, "Acct Pmt Inv");
    }
}