Skip to main content

datasynth_generators/data_quality/
injector.rs

1//! Main data quality injector coordinating all quality issues.
2//!
3//! This module provides a unified interface for introducing various
4//! data quality issues into synthetic data.
5
6use chrono::NaiveDate;
7use rand::Rng;
8use rand::SeedableRng;
9use rand_chacha::ChaCha8Rng;
10use rust_decimal::Decimal;
11use std::collections::HashMap;
12
13use super::duplicates::{DuplicateConfig, DuplicateGenerator, DuplicateStats};
14use super::format_variations::{
15    AmountFormat, DateFormat, FormatVariationConfig, FormatVariationInjector, FormatVariationStats,
16};
17use super::missing_values::{MissingValueConfig, MissingValueInjector, MissingValueStats};
18use super::typos::{introduce_encoding_issue, EncodingIssue, TypoConfig, TypoGenerator, TypoStats};
19
20/// Configuration for the data quality injector.
21#[derive(Debug, Clone)]
22pub struct DataQualityConfig {
23    /// Enable missing value injection.
24    pub enable_missing_values: bool,
25    /// Missing value configuration.
26    pub missing_values: MissingValueConfig,
27    /// Enable format variations.
28    pub enable_format_variations: bool,
29    /// Format variation configuration.
30    pub format_variations: FormatVariationConfig,
31    /// Enable duplicates.
32    pub enable_duplicates: bool,
33    /// Duplicate configuration.
34    pub duplicates: DuplicateConfig,
35    /// Enable typos.
36    pub enable_typos: bool,
37    /// Typo configuration.
38    pub typos: TypoConfig,
39    /// Enable encoding issues.
40    pub enable_encoding_issues: bool,
41    /// Encoding issue rate.
42    pub encoding_issue_rate: f64,
43    /// Random seed for reproducibility.
44    pub seed: u64,
45    /// Track detailed statistics.
46    pub track_statistics: bool,
47}
48
49impl Default for DataQualityConfig {
50    fn default() -> Self {
51        Self {
52            enable_missing_values: true,
53            missing_values: MissingValueConfig::default(),
54            enable_format_variations: true,
55            format_variations: FormatVariationConfig::default(),
56            enable_duplicates: true,
57            duplicates: DuplicateConfig::default(),
58            enable_typos: true,
59            typos: TypoConfig::default(),
60            enable_encoding_issues: false, // Off by default (can cause issues)
61            encoding_issue_rate: 0.001,
62            seed: 42,
63            track_statistics: true,
64        }
65    }
66}
67
68impl DataQualityConfig {
69    /// Creates a minimal configuration (low rates).
70    pub fn minimal() -> Self {
71        Self {
72            missing_values: MissingValueConfig {
73                global_rate: 0.005,
74                ..Default::default()
75            },
76            format_variations: FormatVariationConfig {
77                date_variation_rate: 0.01,
78                amount_variation_rate: 0.01,
79                identifier_variation_rate: 0.005,
80                text_variation_rate: 0.01,
81                ..Default::default()
82            },
83            duplicates: DuplicateConfig {
84                duplicate_rate: 0.001,
85                ..Default::default()
86            },
87            typos: TypoConfig {
88                char_error_rate: 0.001,
89                ..Default::default()
90            },
91            ..Default::default()
92        }
93    }
94
95    /// Creates a high-variation configuration (for stress testing).
96    pub fn high_variation() -> Self {
97        Self {
98            missing_values: MissingValueConfig {
99                global_rate: 0.05,
100                ..Default::default()
101            },
102            format_variations: FormatVariationConfig {
103                date_variation_rate: 0.2,
104                amount_variation_rate: 0.1,
105                identifier_variation_rate: 0.1,
106                text_variation_rate: 0.2,
107                ..Default::default()
108            },
109            duplicates: DuplicateConfig {
110                duplicate_rate: 0.02,
111                ..Default::default()
112            },
113            typos: TypoConfig {
114                char_error_rate: 0.02,
115                ..Default::default()
116            },
117            enable_encoding_issues: true,
118            encoding_issue_rate: 0.01,
119            ..Default::default()
120        }
121    }
122}
123
124/// Combined statistics for all data quality issues.
125#[derive(Debug, Clone, Default)]
126pub struct DataQualityStats {
127    /// Missing value statistics.
128    pub missing_values: MissingValueStats,
129    /// Format variation statistics.
130    pub format_variations: FormatVariationStats,
131    /// Duplicate statistics.
132    pub duplicates: DuplicateStats,
133    /// Typo statistics.
134    pub typos: TypoStats,
135    /// Encoding issues injected.
136    pub encoding_issues: usize,
137    /// Total records processed.
138    pub total_records: usize,
139    /// Records with any quality issue.
140    pub records_with_issues: usize,
141}
142
143impl DataQualityStats {
144    /// Returns the overall issue rate.
145    pub fn overall_issue_rate(&self) -> f64 {
146        if self.total_records == 0 {
147            0.0
148        } else {
149            self.records_with_issues as f64 / self.total_records as f64
150        }
151    }
152
153    /// Returns a summary of issues.
154    pub fn summary(&self) -> HashMap<String, usize> {
155        let mut summary = HashMap::new();
156        summary.insert(
157            "missing_values".to_string(),
158            self.missing_values.total_missing,
159        );
160        summary.insert(
161            "format_variations".to_string(),
162            self.format_variations.date_variations
163                + self.format_variations.amount_variations
164                + self.format_variations.identifier_variations
165                + self.format_variations.text_variations,
166        );
167        summary.insert("duplicates".to_string(), self.duplicates.total_duplicates);
168        summary.insert("typos".to_string(), self.typos.total_typos);
169        summary.insert("encoding_issues".to_string(), self.encoding_issues);
170        summary
171    }
172}
173
174/// A data quality issue record.
175#[derive(Debug, Clone)]
176pub struct QualityIssue {
177    /// Unique issue identifier.
178    pub issue_id: String,
179    /// Type of issue.
180    pub issue_type: QualityIssueType,
181    /// Record identifier.
182    pub record_id: String,
183    /// Field affected.
184    pub field: Option<String>,
185    /// Original value (if available).
186    pub original_value: Option<String>,
187    /// Modified value (if available).
188    pub modified_value: Option<String>,
189    /// Description.
190    pub description: String,
191}
192
/// Category of an injected quality issue.
///
/// The enum is fieldless, so it derives `Copy`, `Eq` and `Hash` in addition to
/// `PartialEq`. This lets values be passed around freely and used as keys in
/// hash-based collections (for example, to count issues per category).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum QualityIssueType {
    /// A field value was removed.
    MissingValue,
    /// A date was rendered in a non-standard format.
    DateFormatVariation,
    /// An amount was rendered in a non-standard format.
    AmountFormatVariation,
    /// An identifier was rendered in a varied format.
    IdentifierFormatVariation,
    /// A text value was rendered in a varied format.
    TextFormatVariation,
    /// Exact duplicate.
    ExactDuplicate,
    /// Near duplicate.
    NearDuplicate,
    /// Fuzzy duplicate.
    FuzzyDuplicate,
    /// A typo was introduced into a text value.
    Typo,
    /// An encoding issue was introduced into a text value.
    EncodingIssue,
}
217
218/// Main data quality injector.
219pub struct DataQualityInjector {
220    config: DataQualityConfig,
221    rng: ChaCha8Rng,
222    missing_value_injector: MissingValueInjector,
223    format_injector: FormatVariationInjector,
224    duplicate_generator: DuplicateGenerator,
225    typo_generator: TypoGenerator,
226    stats: DataQualityStats,
227    issues: Vec<QualityIssue>,
228    next_issue_id: u64,
229}
230
231impl DataQualityInjector {
232    /// Creates a new data quality injector.
233    pub fn new(config: DataQualityConfig) -> Self {
234        let rng = ChaCha8Rng::seed_from_u64(config.seed);
235        let missing_value_injector = MissingValueInjector::new(config.missing_values.clone());
236        let format_injector = FormatVariationInjector::new(config.format_variations.clone());
237        let duplicate_generator = DuplicateGenerator::new(config.duplicates.clone());
238        let typo_generator = TypoGenerator::new(config.typos.clone());
239
240        Self {
241            config,
242            rng,
243            missing_value_injector,
244            format_injector,
245            duplicate_generator,
246            typo_generator,
247            stats: DataQualityStats::default(),
248            issues: Vec::new(),
249            next_issue_id: 1,
250        }
251    }
252
253    /// Processes a text field, potentially introducing quality issues.
254    pub fn process_text_field(
255        &mut self,
256        field: &str,
257        value: &str,
258        record_id: &str,
259        context: &HashMap<String, String>,
260    ) -> Option<String> {
261        let mut result = value.to_string();
262        let mut had_issue = false;
263
264        // Check for missing value
265        if self.config.enable_missing_values
266            && self.missing_value_injector.should_be_missing(
267                field,
268                Some(value),
269                context,
270                &mut self.rng,
271            )
272        {
273            let issue_id = self.next_issue_id();
274            self.record_issue(QualityIssue {
275                issue_id,
276                issue_type: QualityIssueType::MissingValue,
277                record_id: record_id.to_string(),
278                field: Some(field.to_string()),
279                original_value: Some(value.to_string()),
280                modified_value: None,
281                description: format!("Field '{}' set to missing", field),
282            });
283            return None;
284        }
285
286        // Apply typos
287        if self.config.enable_typos && !self.typo_generator.is_protected(field) {
288            let with_typos = self.typo_generator.introduce_typos(&result, &mut self.rng);
289            if with_typos != result {
290                let issue_id = self.next_issue_id();
291                self.record_issue(QualityIssue {
292                    issue_id,
293                    issue_type: QualityIssueType::Typo,
294                    record_id: record_id.to_string(),
295                    field: Some(field.to_string()),
296                    original_value: Some(result.clone()),
297                    modified_value: Some(with_typos.clone()),
298                    description: format!("Typo introduced in field '{}'", field),
299                });
300                result = with_typos;
301                had_issue = true;
302            }
303        }
304
305        // Apply format variations
306        if self.config.enable_format_variations {
307            let varied = self.format_injector.vary_text(&result, &mut self.rng);
308            if varied != result {
309                let issue_id = self.next_issue_id();
310                self.record_issue(QualityIssue {
311                    issue_id,
312                    issue_type: QualityIssueType::TextFormatVariation,
313                    record_id: record_id.to_string(),
314                    field: Some(field.to_string()),
315                    original_value: Some(result.clone()),
316                    modified_value: Some(varied.clone()),
317                    description: format!("Format variation in field '{}'", field),
318                });
319                result = varied;
320                had_issue = true;
321            }
322        }
323
324        // Apply encoding issues
325        if self.config.enable_encoding_issues
326            && self.rng.gen::<f64>() < self.config.encoding_issue_rate
327        {
328            let issues = [
329                EncodingIssue::Mojibake,
330                EncodingIssue::MissingChars,
331                EncodingIssue::HTMLEntities,
332            ];
333            let issue = issues[self.rng.gen_range(0..issues.len())];
334            let with_encoding = introduce_encoding_issue(&result, issue, &mut self.rng);
335
336            if with_encoding != result {
337                let issue_id = self.next_issue_id();
338                self.record_issue(QualityIssue {
339                    issue_id,
340                    issue_type: QualityIssueType::EncodingIssue,
341                    record_id: record_id.to_string(),
342                    field: Some(field.to_string()),
343                    original_value: Some(result.clone()),
344                    modified_value: Some(with_encoding.clone()),
345                    description: format!("Encoding issue ({:?}) in field '{}'", issue, field),
346                });
347                result = with_encoding;
348                had_issue = true;
349                self.stats.encoding_issues += 1;
350            }
351        }
352
353        if had_issue {
354            self.stats.records_with_issues += 1;
355        }
356
357        Some(result)
358    }
359
360    /// Processes a date field, potentially introducing format variations.
361    pub fn process_date_field(
362        &mut self,
363        field: &str,
364        date: NaiveDate,
365        record_id: &str,
366        context: &HashMap<String, String>,
367    ) -> Option<String> {
368        // Check for missing value
369        if self.config.enable_missing_values
370            && self.missing_value_injector.should_be_missing(
371                field,
372                Some(&date.to_string()),
373                context,
374                &mut self.rng,
375            )
376        {
377            let issue_id = self.next_issue_id();
378            self.record_issue(QualityIssue {
379                issue_id,
380                issue_type: QualityIssueType::MissingValue,
381                record_id: record_id.to_string(),
382                field: Some(field.to_string()),
383                original_value: Some(date.to_string()),
384                modified_value: None,
385                description: format!("Date field '{}' set to missing", field),
386            });
387            return None;
388        }
389
390        // Apply format variations
391        if self.config.enable_format_variations {
392            let formatted = self.format_injector.vary_date(date, &mut self.rng);
393            let standard = DateFormat::ISO.format(date);
394
395            if formatted != standard {
396                let issue_id = self.next_issue_id();
397                self.record_issue(QualityIssue {
398                    issue_id,
399                    issue_type: QualityIssueType::DateFormatVariation,
400                    record_id: record_id.to_string(),
401                    field: Some(field.to_string()),
402                    original_value: Some(standard),
403                    modified_value: Some(formatted.clone()),
404                    description: format!("Date format variation in field '{}'", field),
405                });
406            }
407
408            return Some(formatted);
409        }
410
411        Some(DateFormat::ISO.format(date))
412    }
413
414    /// Processes an amount field, potentially introducing format variations.
415    pub fn process_amount_field(
416        &mut self,
417        field: &str,
418        amount: Decimal,
419        record_id: &str,
420        context: &HashMap<String, String>,
421    ) -> Option<String> {
422        // Check for missing value
423        if self.config.enable_missing_values
424            && self.missing_value_injector.should_be_missing(
425                field,
426                Some(&amount.to_string()),
427                context,
428                &mut self.rng,
429            )
430        {
431            let issue_id = self.next_issue_id();
432            self.record_issue(QualityIssue {
433                issue_id,
434                issue_type: QualityIssueType::MissingValue,
435                record_id: record_id.to_string(),
436                field: Some(field.to_string()),
437                original_value: Some(amount.to_string()),
438                modified_value: None,
439                description: format!("Amount field '{}' set to missing", field),
440            });
441            return None;
442        }
443
444        // Apply format variations
445        if self.config.enable_format_variations {
446            let formatted = self.format_injector.vary_amount(amount, &mut self.rng);
447            let standard = AmountFormat::Plain.format(amount);
448
449            if formatted != standard {
450                let issue_id = self.next_issue_id();
451                self.record_issue(QualityIssue {
452                    issue_id,
453                    issue_type: QualityIssueType::AmountFormatVariation,
454                    record_id: record_id.to_string(),
455                    field: Some(field.to_string()),
456                    original_value: Some(standard),
457                    modified_value: Some(formatted.clone()),
458                    description: format!("Amount format variation in field '{}'", field),
459                });
460            }
461
462            return Some(formatted);
463        }
464
465        Some(AmountFormat::Plain.format(amount))
466    }
467
468    /// Processes an identifier field, potentially introducing variations.
469    pub fn process_identifier_field(
470        &mut self,
471        field: &str,
472        id: &str,
473        record_id: &str,
474        context: &HashMap<String, String>,
475    ) -> Option<String> {
476        // Check for missing value (rare for identifiers)
477        if self.config.enable_missing_values
478            && self.missing_value_injector.should_be_missing(
479                field,
480                Some(id),
481                context,
482                &mut self.rng,
483            )
484        {
485            let issue_id = self.next_issue_id();
486            self.record_issue(QualityIssue {
487                issue_id,
488                issue_type: QualityIssueType::MissingValue,
489                record_id: record_id.to_string(),
490                field: Some(field.to_string()),
491                original_value: Some(id.to_string()),
492                modified_value: None,
493                description: format!("Identifier field '{}' set to missing", field),
494            });
495            return None;
496        }
497
498        // Apply format variations
499        if self.config.enable_format_variations {
500            let varied = self.format_injector.vary_identifier(id, &mut self.rng);
501
502            if varied != id {
503                let issue_id = self.next_issue_id();
504                self.record_issue(QualityIssue {
505                    issue_id,
506                    issue_type: QualityIssueType::IdentifierFormatVariation,
507                    record_id: record_id.to_string(),
508                    field: Some(field.to_string()),
509                    original_value: Some(id.to_string()),
510                    modified_value: Some(varied.clone()),
511                    description: format!("Identifier format variation in field '{}'", field),
512                });
513            }
514
515            return Some(varied);
516        }
517
518        Some(id.to_string())
519    }
520
521    /// Determines if a record should be duplicated.
522    pub fn should_duplicate(&mut self) -> bool {
523        self.config.enable_duplicates && self.duplicate_generator.should_duplicate(&mut self.rng)
524    }
525
526    /// Records a quality issue.
527    fn record_issue(&mut self, issue: QualityIssue) {
528        if self.config.track_statistics {
529            self.issues.push(issue);
530        }
531    }
532
533    /// Generates the next issue ID.
534    fn next_issue_id(&mut self) -> String {
535        let id = format!("QI{:08}", self.next_issue_id);
536        self.next_issue_id += 1;
537        id
538    }
539
540    /// Returns statistics.
541    pub fn stats(&self) -> &DataQualityStats {
542        &self.stats
543    }
544
545    /// Returns all recorded issues.
546    pub fn issues(&self) -> &[QualityIssue] {
547        &self.issues
548    }
549
550    /// Returns issues for a specific record.
551    pub fn issues_for_record(&self, record_id: &str) -> Vec<&QualityIssue> {
552        self.issues
553            .iter()
554            .filter(|i| i.record_id == record_id)
555            .collect()
556    }
557
558    /// Returns issues of a specific type.
559    pub fn issues_by_type(&self, issue_type: QualityIssueType) -> Vec<&QualityIssue> {
560        self.issues
561            .iter()
562            .filter(|i| i.issue_type == issue_type)
563            .collect()
564    }
565
566    /// Resets all statistics and issues.
567    pub fn reset(&mut self) {
568        self.stats = DataQualityStats::default();
569        self.issues.clear();
570        self.next_issue_id = 1;
571        self.missing_value_injector.reset_stats();
572        self.format_injector.reset_stats();
573        self.duplicate_generator.reset_stats();
574        self.typo_generator.reset_stats();
575    }
576
577    /// Updates aggregate statistics.
578    pub fn update_stats(&mut self) {
579        self.stats.missing_values = self.missing_value_injector.stats().clone();
580        self.stats.format_variations = self.format_injector.stats().clone();
581        self.stats.duplicates = self.duplicate_generator.stats().clone();
582        self.stats.typos = self.typo_generator.stats().clone();
583    }
584}
585
586/// Builder for DataQualityConfig.
587pub struct DataQualityConfigBuilder {
588    config: DataQualityConfig,
589}
590
591impl DataQualityConfigBuilder {
592    /// Creates a new builder with default configuration.
593    pub fn new() -> Self {
594        Self {
595            config: DataQualityConfig::default(),
596        }
597    }
598
599    /// Enables or disables missing values.
600    pub fn with_missing_values(mut self, enable: bool) -> Self {
601        self.config.enable_missing_values = enable;
602        self
603    }
604
605    /// Sets the global missing value rate.
606    pub fn with_missing_rate(mut self, rate: f64) -> Self {
607        self.config.missing_values.global_rate = rate;
608        self
609    }
610
611    /// Enables or disables format variations.
612    pub fn with_format_variations(mut self, enable: bool) -> Self {
613        self.config.enable_format_variations = enable;
614        self
615    }
616
617    /// Enables or disables duplicates.
618    pub fn with_duplicates(mut self, enable: bool) -> Self {
619        self.config.enable_duplicates = enable;
620        self
621    }
622
623    /// Sets the duplicate rate.
624    pub fn with_duplicate_rate(mut self, rate: f64) -> Self {
625        self.config.duplicates.duplicate_rate = rate;
626        self
627    }
628
629    /// Enables or disables typos.
630    pub fn with_typos(mut self, enable: bool) -> Self {
631        self.config.enable_typos = enable;
632        self
633    }
634
635    /// Sets the typo rate.
636    pub fn with_typo_rate(mut self, rate: f64) -> Self {
637        self.config.typos.char_error_rate = rate;
638        self
639    }
640
641    /// Enables or disables encoding issues.
642    pub fn with_encoding_issues(mut self, enable: bool) -> Self {
643        self.config.enable_encoding_issues = enable;
644        self
645    }
646
647    /// Sets the random seed.
648    pub fn with_seed(mut self, seed: u64) -> Self {
649        self.config.seed = seed;
650        self
651    }
652
653    /// Builds the configuration.
654    pub fn build(self) -> DataQualityConfig {
655        self.config
656    }
657}
658
659impl Default for DataQualityConfigBuilder {
660    fn default() -> Self {
661        Self::new()
662    }
663}
664
#[cfg(test)]
mod tests {
    use super::*;
    use rust_decimal_macros::dec;

    /// Injector built from the default configuration with format variations
    /// explicitly enabled and a fixed seed of 42.
    fn format_variation_injector() -> DataQualityInjector {
        let config = DataQualityConfigBuilder::new()
            .with_format_variations(true)
            .with_seed(42)
            .build();
        DataQualityInjector::new(config)
    }

    #[test]
    fn test_data_quality_injector_creation() {
        let injector = DataQualityInjector::new(DataQualityConfig::default());

        // A fresh injector has not processed anything yet.
        assert_eq!(injector.stats().total_records, 0);
    }

    #[test]
    fn test_text_field_processing() {
        // High typo rate so the typo path is exercised.
        let config = DataQualityConfigBuilder::new()
            .with_typo_rate(0.5)
            .with_seed(42)
            .build();
        let mut injector = DataQualityInjector::new(config);
        let context = HashMap::new();

        let processed =
            injector.process_text_field("description", "Test Entry Description", "JE001", &context);

        assert!(processed.is_some());
    }

    #[test]
    fn test_date_field_processing() {
        let mut injector = format_variation_injector();
        let context = HashMap::new();
        let posting_date = NaiveDate::from_ymd_opt(2024, 1, 15).expect("valid calendar date");

        let processed =
            injector.process_date_field("posting_date", posting_date, "JE001", &context);

        assert!(processed.is_some());
    }

    #[test]
    fn test_amount_field_processing() {
        let mut injector = format_variation_injector();
        let context = HashMap::new();

        let processed =
            injector.process_amount_field("debit_amount", dec!(1234.56), "JE001", &context);

        assert!(processed.is_some());
    }

    #[test]
    fn test_minimal_config() {
        let minimal = DataQualityConfig::minimal();

        assert!(minimal.missing_values.global_rate < 0.01);
        assert!(minimal.typos.char_error_rate < 0.01);
    }

    #[test]
    fn test_high_variation_config() {
        let stress = DataQualityConfig::high_variation();

        assert!(stress.missing_values.global_rate > 0.01);
        assert!(stress.enable_encoding_issues);
    }
}